int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int) vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s = 0;
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
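/* A standalone sketch (not from the sources above) of the two idioms the
 * pix_norm1/pix_sum variants share: the classic AltiVec misaligned 16-byte
 * load, which stitches two aligned vec_ld results together with
 * vec_lvsl/vec_perm, and the horizontal reduction, where vec_sums leaves the
 * total in lane 3 and vec_splat + vec_ste extract it. The helper names are
 * ours, not the projects'. */
#include <altivec.h>
#include <stdint.h>

static inline vector unsigned char load_unaligned16(const uint8_t *p)
{
    vector unsigned char l = vec_ld(0,  p);   /* aligned block holding p[0]  */
    vector unsigned char r = vec_ld(15, p);   /* aligned block holding p[15] */
    return vec_perm(l, r, vec_lvsl(0, p));    /* merge into p[0..15] */
}

static inline int reduce_add_u32(vector unsigned int v)
{
    int s;
    vector signed int sum = vec_sums((vector signed int) v,
                                     vec_splat_s32(0));  /* total lands in lane 3 */
    sum = vec_splat(sum, 3);                             /* broadcast lane 3 */
    vec_ste(sum, 0, &s);                                 /* store one int element */
    return s;
}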
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
    const VECU32 shift = vec_splat_u32(-16);
    const VECS32 scale1 = { factor, factor, factor, factor };
    const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor };
    VECU32 temp = vec_msum((VECU16)m_value, (VECU16)vec_rl(scale1, shift), vec_splat_u32(0));
    temp = vec_msum((VECU16)other.m_value, (VECU16)vec_rl(scale2, shift), temp);
    m_value = vec_msum((VECU16)m_value, (VECU16)scale1, vec_mulo((VECU16)other.m_value, (VECU16)scale2));
    m_value = vec_add(vec_sl(temp, shift), (VECU32)m_value);
    sra(8);
}
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int) vec_splat_u32(0);

    permclear = (vector unsigned char)
        AVV(255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
static inline vec_uint4 GENBX(vec_uint4 a, vec_uint4 b, vec_uint4 c)
{
    /* Per-lane flag: 1 where a > b, or where a == b and the low bit of the
       corresponding lane of c is set. */
    return vec_and(vec_or(vec_cmpgt(a, b),
                          vec_and(vec_cmpeq(a, b), c)),
                   vec_splat_u32(1));
}
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
    int Q = p7O_NQF(ox->M);
    int q = (k-1) % Q;   /* (q,r) is position of the current DP cell M(i,k) */
    int r = (k-1) / Q;
    vector float *tp = om->tfv + 7*q;  /* *tp now at start of transitions to cur cell M(i,k) */
    vector float xBv;
    vector float zerov;
    vector float mpv, dpv, ipv;
    union { vector float v; float p[4]; } u;
    float path[4];
    int state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };

    xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
    zerov = (vector float) vec_splat_u32(0);

    if (q > 0) {
        mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
        dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
        ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
    } else {
        mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
        dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
        ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
    }

    u.v = vec_madd(xBv, *tp, zerov); tp++;  path[0] = u.p[r];
    u.v = vec_madd(mpv, *tp, zerov); tp++;  path[1] = u.p[r];
    u.v = vec_madd(ipv, *tp, zerov); tp++;  path[2] = u.p[r];
    u.v = vec_madd(dpv, *tp, zerov);        path[3] = u.p[r];

    esl_vec_FNorm(path, 4);
    return state[esl_rnd_FChoose(rng, path, 4)];
}
/* Using FChoose() here would mean allocating tmp space for 2M-1 paths;
 * instead we use the fact that E(i) is itself the necessary normalization
 * factor, and implement FChoose's algorithm here for an on-the-fly
 * calculation.
 */
static inline int
select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k)
{
    int Q = p7O_NQF(ox->M);
    double sum  = 0.0;
    double roll = esl_random(rng);
    double norm = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E];  /* all M, D already scaled exactly the same */
    vector float xEv   = esl_vmx_set_float(norm);
    vector float zerov = (vector float) vec_splat_u32(0);
    union { vector float v; float p[4]; } u;
    int q, r;

    while (1) {
        for (q = 0; q < Q; q++) {
            u.v = vec_madd(ox->dpf[i][q*3 + p7X_M], xEv, zerov);
            for (r = 0; r < 4; r++) {
                sum += u.p[r];
                if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_M; }
            }
            u.v = vec_madd(ox->dpf[i][q*3 + p7X_D], xEv, zerov);
            for (r = 0; r < 4; r++) {
                sum += u.p[r];
                if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_D; }
            }
        }
        ESL_DASSERT1(sum > 0.99);
    }
    /*UNREACHED*/
    ESL_EXCEPTION(-1, "unreached code was reached. universe collapses.");
}
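/* select_e above inlines the categorical-sampling loop its comment refers to
 * as FChoose's algorithm: draw one uniform variate, then accumulate the
 * normalized weights until the running sum passes it. A scalar sketch of the
 * same idea (choose_index is our name, not Easel's): */
static int choose_index(const float *p, int n, double roll)
{
    /* p[] is assumed normalized to sum to 1; roll is uniform on [0,1) */
    double sum = 0.0;
    int i;
    for (i = 0; i < n; i++) {
        sum += p[i];
        if (roll < sum) return i;
    }
    return n - 1;   /* guard against floating-point round-off */
}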
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
    int Q = p7O_NQF(ox->M);
    int q = (k-1) % Q;   /* (q,r) is position of the current DP cell M(i,k) */
    int r = (k-1) / Q;
    vector float *tp = om->tfv + 7*q;  /* *tp now at start of transitions to cur cell M(i,k) */
    vector float xBv;
    vector float zerov;
    vector float mpv, dpv, ipv;
    union { vector float v; float p[4]; } u, tv;
    float path[4];
    int state[4] = { p7T_M, p7T_I, p7T_D, p7T_B };

    xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
    zerov = (vector float) vec_splat_u32(0);

    if (q > 0) {
        mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
        dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
        ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
    } else {
        mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
        dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
        ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
    }

    /* paths are numbered so that most desirable choice in case of tie is first. */
    u.v = xBv; tv.v = *tp; path[3] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]); tp++;
    u.v = mpv; tv.v = *tp; path[0] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]); tp++;
    u.v = ipv; tv.v = *tp; path[1] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]); tp++;
    u.v = dpv; tv.v = *tp; path[2] = ((tv.p[r] == 0.0) ? -eslINFINITY : u.p[r]);
    return state[esl_vec_FArgMax(path, 4)];
}
static void vector_fmul_window_altivec(float *dst, const float *src0,
                                       const float *src1, const float *win,
                                       int len)
{
    vector float zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3, 2, 1, 0);
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    zero = (vector float) vec_splat_u32(0);

    for (i = -len*4, j = len*4 - 16; i < 0; i += 16, j -= 16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, zero);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, zero);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
    int Q = p7O_NQF(ox->M);
    int q = (k-1) % Q;   /* (q,r) is position of the current DP cell D(i,k) */
    int r = (k-1) / Q;
    vector float zerov;
    union { vector float v; float p[4]; } mpv, dpv, tmdv, tddv;
    float path[2];

    zerov = (vector float) vec_splat_u32(0);

    if (q > 0) {
        mpv.v  = ox->dpf[i][(q-1)*3 + p7X_M];
        dpv.v  = ox->dpf[i][(q-1)*3 + p7X_D];
        tmdv.v = om->tfv[7*(q-1) + p7O_MD];
        tddv.v = om->tfv[7*Q + (q-1)];
    } else {
        mpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
        dpv.v  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
        tmdv.v = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD], 12);
        tddv.v = vec_sld(zerov, om->tfv[8*Q-1], 12);
    }

    path[0] = ((tmdv.p[r] == 0.0) ? -eslINFINITY : mpv.p[r]);
    path[1] = ((tddv.p[r] == 0.0) ? -eslINFINITY : dpv.p[r]);
    return ((path[0] >= path[1]) ? p7T_M : p7T_D);
}
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
    int Q = p7O_NQF(ox->M);
    int q = (k-1) % Q;   /* (q,r) is position of the current DP cell D(i,k) */
    int r = (k-1) / Q;
    vector float zerov;
    vector float mpv, dpv;
    vector float tmdv, tddv;
    union { vector float v; float p[4]; } u;
    float path[2];
    int state[2] = { p7T_M, p7T_D };

    zerov = (vector float) vec_splat_u32(0);

    if (q > 0) {
        mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
        dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
        tmdv = om->tfv[7*(q-1) + p7O_MD];
        tddv = om->tfv[7*Q + (q-1)];
    } else {
        mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
        dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
        tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD], 12);
        tddv = vec_sld(zerov, om->tfv[8*Q-1], 12);
    }

    u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r];
    u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r];
    esl_vec_FNorm(path, 2);
    return state[esl_rnd_FChoose(rng, path, 2)];
}
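/* In the striped DP layout used by select_m/select_d above, the q == 0 case
 * needs the previous column's vector shifted down one lane with a zero pushed
 * in, which is what vec_sld(zerov, v, 12) does on big-endian PowerPC: it takes
 * the last 4 bytes of zerov followed by the first 12 bytes of v. A minimal
 * sketch, independent of HMMER's types: */
#include <altivec.h>

static inline vector float shift_in_zero(vector float v)
{
    vector float zerov = (vector float) vec_splat_u32(0);
    return vec_sld(zerov, v, 12);   /* yields { 0, v[0], v[1], v[2] } */
}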
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0, s2, s3,
                 zero = (vector float) vec_splat_u32(0);

    src1 += len - 4;
    for (i = 0; i < len - 7; i += 8) {
        s1 = vec_ld(0, src1 - i);              // [a,b,c,d]
        s0 = vec_ld(0, src0 + i);
        l0 = vec_mergel(s1, s1);               // [c,c,d,d]
        s3 = vec_ld(-16, src1 - i);
        h0 = vec_mergeh(s1, s1);               // [a,a,b,b]
        s2 = vec_ld(16, src0 + i);
        s1 = vec_mergeh(vec_mergel(l0, h0),    // [d,b,d,b]
                        vec_mergeh(l0, h0));   // [c,a,c,a]
                                               // -> [d,c,b,a]
        l0 = vec_mergel(s3, s3);
        d  = vec_madd(s0, s1, zero);
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst + i);
        s3 = vec_mergeh(vec_mergel(l0, h0),
                        vec_mergeh(l0, h0));
        d  = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst + i);
    }
}
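/* The mergeh/mergel dance above reverses the four floats of a vector without
 * loading a permute mask; the same reversal can be written with a single
 * vec_perm and an explicit byte pattern, as vector_fmul_window_altivec does
 * above with vcprm(3,2,1,0). A sketch with the pattern spelled out (big-endian
 * lane order assumed; the helper name is ours): */
#include <altivec.h>

static inline vector float reverse_floats(vector float v)
{
    const vector unsigned char rev = {
        12, 13, 14, 15,  8, 9, 10, 11,  4, 5, 6, 7,  0, 1, 2, 3
    };
    return vec_perm(v, v, rev);   /* [a,b,c,d] -> [d,c,b,a] */
}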
static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 3;
    const int add   = (1 << (shift - 1));
    const vector uint32_t vadd   = (vector uint32_t) {add, add, add, add};
    const vector uint16_t vswap  = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
    vector uint32_t v, v2;
    vector uint16_t vd;
    int i;

    yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);

    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);

        v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
        v2 = vec_add(v2, vadd);
        v2 = vec_sr(v2, vshift);

        vd = vec_packsu(v, v2);
        vd = vec_rl(vd, vswap);

        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
}
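/* Unlike the vec_lvsl/vec_perm pairs in the pure-AltiVec functions above, the
 * VSX loop in yuv2plane1_16_vsx can use vec_vsx_ld, which tolerates misaligned
 * addresses directly on POWER7 and later, so no permute setup is needed. A
 * minimal sketch (load_u32x4 is our name): */
#include <altivec.h>
#include <stdint.h>

static inline vector unsigned int load_u32x4(const uint32_t *p)
{
    return vec_vsx_ld(0, p);   /* no 16-byte alignment requirement */
}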
uint32_t sad8_altivec_c(const uint8_t *cur, const uint8_t *ref,
                        const uint32_t stride)
{
    uint32_t result = 0;

    register vector unsigned int sad;
    register vector unsigned char c;
    register vector unsigned char r;

    /* initialize */
    sad = vec_splat_u32(0);

    /* Perform sad operations */
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();
    SAD8();

    /* finish addition, add the first 2 together */
    sad = vec_and(sad, (vector unsigned int) vec_pack(vec_splat_u16(-1), vec_splat_u16(0)));
    sad = (vector unsigned int) vec_sums((vector signed int) sad, vec_splat_s32(0));
    sad = vec_splat(sad, 3);
    vec_ste(sad, 0, &result);

    return result;
}
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int) vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
void rgbaint_t::blend(const rgbaint_t& other, UINT8 factor)
{
    const VECU32 shift = vec_splat_u32(-16);
    const VECS32 scale1 = { factor, factor, factor, factor };
    const VECS32 scale2 = { 0x100 - factor, 0x100 - factor, 0x100 - factor, 0x100 - factor };
    VECU32 temp = vec_msum(VECU16(m_value), VECU16(vec_rl(scale1, shift)), vec_splat_u32(0));
    temp = vec_msum(VECU16(other.m_value), VECU16(vec_rl(scale2, shift)), temp);
#if defined __LITTLE_ENDIAN__
    m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mule(VECU16(other.m_value), VECU16(scale2))));
#else
    m_value = VECS32(vec_msum(VECU16(m_value), VECU16(scale1), vec_mulo(VECU16(other.m_value), VECU16(scale2))));
#endif
    m_value = VECS32(vec_add(vec_sl(temp, shift), VECU32(m_value)));
    sra_imm(8);
}
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    permclear = (vector unsigned char)
        AVV(255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
uint32_t sad16_altivec_c(vector unsigned char *cur, vector unsigned char *ref,
                         uint32_t stride, const uint32_t best_sad)
{
    vector unsigned char perm;
    vector unsigned char t1, t2;
    vector unsigned int sad;
    vector unsigned int sumdiffs;
    vector unsigned int best_vec;
    uint32_t result;

#ifdef DEBUG
    /* print alignment errors if DEBUG is on */
    if (((unsigned long) cur) & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long) cur);
    if (stride & 0xf)
        fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", (unsigned long) stride);
#endif

    /* initialization */
    sad = vec_splat_u32(0);
    sumdiffs = sad;
    stride >>= 4;
    perm = vec_lvsl(0, (unsigned char *) ref);
    *((uint32_t *) &best_vec) = best_sad;
    best_vec = vec_splat(best_vec, 0);

    /* perform sum of differences between current and previous */
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();
    SAD16();

  bail:
    /* the SAD16() macro can branch here early once the running sum
       exceeds best_sad */
    /* copy vector sum into unaligned result */
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, (uint32_t *) &result);

    return result;
}
uint32_t dequant_mpeg_intra_altivec_c(int16_t *data, const int16_t *coeff,
                                      const uint32_t quant,
                                      const uint32_t dcscalar,
                                      const uint16_t *mpeg_quant_matrices)
{
    register const uint16_t *intra_matrix = get_intra_matrix(mpeg_quant_matrices);
    register const int16_t *coeff_ptr = coeff;
    register int16_t *data_ptr = data;

    register vec_sint16_t ox00;
    register vec_sint16_t level;
    register vec_sint16_t vec_2048;
    register vec_uint16_t vintra;
    register vec_uint32_t swap;
    register vec_uint32_t even, odd;
    register vec_uint32_t et, ot, t;
    vec_uint32_t vquant;

    vector bool short zero_less;
    vector bool short overflow;

#ifdef DEBUG
    if ((long) data & 0xf)
        fprintf(stderr, "xvidcore: error in dequant_mpeg_intra_altivec_c, incorrect align: %lx\n", (long) data);
#endif

    /* Initialize */
    ox00 = vec_splat_s16(0);
    *((uint32_t *) &vquant) = quant;
    vquant = vec_splat(vquant, 0);

    swap = vec_rl(vquant, vec_splat_u32(-16));
    vec_2048 = (vec_sint16_t) vec_rl(vec_splat_u16(8), vec_splat_u16(8));

    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();
    DEQUANT_MPEG_INTRA();

    /* Process the DC coefficient, clamping it to [-2048, 2047] */
    data[0] = coeff[0] * dcscalar;
    if (data[0] < -2048) {
        data[0] = -2048;
    } else if (data[0] > 2047) {
        data[0] = 2047;
    }

    return 0;
}
void foo (void)
{
    vector bool int   boolVec1 = (vector bool int)   vec_splat_u32(3);
    vector bool short boolVec2 = (vector bool short) vec_splat_u16(3);
    vector bool char  boolVec3 = (vector bool char)  vec_splat_u8(3);

    boolVec1 = vec_sld(boolVec1, boolVec1, 4);
    boolVec2 = vec_sld(boolVec2, boolVec2, 2);
    boolVec3 = vec_sld(boolVec3, boolVec3, 1);
}
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    DECLARE_ALIGNED(16, int, val)[16];
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        vector signed short l1, vLumFilter = vec_ld(j << 1, filter);
        vector unsigned char perm, perm0 = vec_lvsl(j << 1, filter);
        vLumFilter = vec_perm(vLumFilter, vLumFilter, perm0);
        vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter

        perm = vec_lvsl(x << 1, src[j]);
        l1   = vec_ld(x << 1, src[j]);

        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    vec_st(vf, 0, dest);
}
void
evas_common_cpu_altivec_test(void)
{
#ifdef __POWERPC__
#ifdef __VEC__
    vector unsigned int zero;

    zero = vec_splat_u32(0);
#endif /* __VEC__ */
#endif /* __POWERPC__ */
}
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
                                  const int16_t **src, uint8_t *dest,
                                  const uint8_t *dither, int offset, int x)
{
    register int i, j;
    LOCAL_ALIGNED(16, int, val, [16]);
    vector signed int vo1, vo2, vo3, vo4;
    vector unsigned short vs1, vs2;
    vector unsigned char vf;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));

    for (i = 0; i < 16; i++)
        val[i] = dither[(x + i + offset) & 7] << 12;

    vo1 = vec_ld(0,  val);
    vo2 = vec_ld(16, val);
    vo3 = vec_ld(32, val);
    vo4 = vec_ld(48, val);

    for (j = 0; j < filterSize; j++) {
        unsigned int joffset = j << 1;
        unsigned int xoffset = x << 1;
        vector unsigned char perm;
        vector signed short l1, vLumFilter;
        LOAD_FILTER(vLumFilter, filter);
        vLumFilter = vec_splat(vLumFilter, 0);
        LOAD_L1(l1, src[j], perm);
        yuv2planeX_8(vo1, vo2, l1, src[j], x,     perm, vLumFilter);
        yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
    }

    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
    vs1 = vec_packsu(vo1, vo2);
    vs2 = vec_packsu(vo3, vo4);
    vf  = vec_packsu(vs1, vs2);
    VEC_ST(vf, 0, dest);
}
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static inline void altivec_packIntArrayToCharArray(int *val, uint8_t *dest,
                                                   int dstW)
{
    register int i;
    vector unsigned int altivec_vectorShiftInt19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    if ((uintptr_t)dest % 16) {
        /* badly aligned store, we force store alignment */
        /* and will handle load misalignment on val w/ vec_perm */
        vector unsigned char perm1;
        vector signed int v1;
        for (i = 0; (i < dstW) && (((uintptr_t)dest + i) % 16); i++) {
            int t = val[i] >> 19;
            dest[i] = (t < 0) ? 0 : ((t > 255) ? 255 : t);
        }
        perm1 = vec_lvsl(i << 2, val);
        v1 = vec_ld(i << 2, val);
        for ( ; i < (dstW - 15); i += 16) {
            int offset = i << 2;
            vector signed int v2 = vec_ld(offset + 16, val);
            vector signed int v3 = vec_ld(offset + 32, val);
            vector signed int v4 = vec_ld(offset + 48, val);
            vector signed int v5 = vec_ld(offset + 64, val);
            vector signed int v12 = vec_perm(v1, v2, perm1);
            vector signed int v23 = vec_perm(v2, v3, perm1);
            vector signed int v34 = vec_perm(v3, v4, perm1);
            vector signed int v45 = vec_perm(v4, v5, perm1);

            vector signed int vA = vec_sra(v12, altivec_vectorShiftInt19);
            vector signed int vB = vec_sra(v23, altivec_vectorShiftInt19);
            vector signed int vC = vec_sra(v34, altivec_vectorShiftInt19);
            vector signed int vD = vec_sra(v45, altivec_vectorShiftInt19);
            vector unsigned short vs1 = vec_packsu(vA, vB);
            vector unsigned short vs2 = vec_packsu(vC, vD);
            vector unsigned char vf = vec_packsu(vs1, vs2);
            vec_st(vf, i, dest);
            v1 = v5;
        }
    } else { // dest is properly aligned, great
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    shifts = zero_u32v;
    if (shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if (shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if (shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if (shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if (shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        pv = (vec_s16 *) v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);

    return ires;
}
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            intptr_t blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15), vec_splat_u32(15)), vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag + i);
        a = vec_ld(0, ang + i);
        t0 = vec_cmple(m, (vector float) vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float) vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int) t0, v_31));
        t0 = (vector bool int) vec_and(a, t1);
        t1 = (vector bool int) vec_andc(a, t1);
        a = vec_sub(m, (vector float) t1);
        m = vec_add(m, (vector float) t0);
        vec_stl(a, 0, ang + i);
        vec_stl(m, 0, mag + i);
    }
}
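/* vec_splat_u32 takes a 5-bit signed immediate (-16..15), which is why the
 * code in this section synthesizes larger constants arithmetically: 19 as
 * 10 + 9 (the >>19 shift count), 31 as 15 + 15 + 1 (the sign-bit shift above),
 * 16 as 8 << 1, and a 16-bit rotate via the immediate -16 (shift and rotate
 * counts are taken modulo the element width). A standalone sketch of the
 * pattern, with helper names of our own: */
#include <altivec.h>

static inline vector unsigned int splat_u32_19(void)
{
    return vec_add(vec_splat_u32(10), vec_splat_u32(9));
}

static inline vector unsigned int splat_u32_31(void)
{
    return vec_add(vec_add(vec_splat_u32(15), vec_splat_u32(15)),
                   vec_splat_u32(1));
}

static inline vector unsigned int splat_u32_16(void)
{
    return vec_sl(vec_splat_u32(8), vec_splat_u32(1));   /* 8 << 1 */
}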
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
    int i;
    vector float d0, d1, s, zero = (vector float) vec_splat_u32(0);
    for (i = 0; i < len - 7; i += 8) {
        d0 = vec_ld( 0, dst + i);
        s  = vec_ld( 0, src + i);
        d1 = vec_ld(16, dst + i);
        d0 = vec_madd(d0, s, zero);
        d1 = vec_madd(d1, vec_ld(16, src + i), zero);
        vec_st(d0,  0, dst + i);
        vec_st(d1, 16, dst + i);
    }
}