cv::Mat HogIntegralImageComputer::compute_block_features(const cv::Point &start, const cv::Point &end) const { // sanity checks assert(end.y < _integral_image.rows-1 && end.x < _integral_image.cols-1 && end.x >= 0 && end.y >= 0); assert(start.y < _integral_image.rows-1 && start.x < _integral_image.cols-1 && start.x >= 0 && start.y >= 0); // We have top, middle and bottom blocks // The border of blocks might overlap (for simplicity reasons) int h = end.y - start.y + 1; if (h < 3) { //return cv::Mat::zeros(1, 8*3, CV_32FC1); return cv::Mat::zeros(1, CC_HOG_CHANS*3, CV_32FC1); } int h_top = h * 0.2; // the first border int h_bottom = h * 0.8; // the second border h_top = std::min(h_top, h - 3); h_bottom = std::min(h_bottom, h - 2); cv::Point start1 = start; cv::Point end1 = cv::Point(end.x, start.y + h_top); cv::Point start2 = cv::Point(start.x, start.y + h_top + 1); cv::Point end2 = cv::Point(end.x, start.y + h_bottom); cv::Point start3 = cv::Point(start.x, start.y + h_bottom + 1); cv::Point end3 = cv::Point(end.x, end.y); cv::VecHogf v1 = vec_max(query_ii(start1, end1), cv::VecHogf(0.0f)); cv::VecHogf v2 = vec_max(query_ii(start2, end2), cv::VecHogf(0.0f)); cv::VecHogf v3 = vec_max(query_ii(start3, end3), cv::VecHogf(0.0f)); cv::VecHogf v_norm = v1 + v2 + v3; assert(v_norm[4] + 1e-5 > 0); v1 = v1 / (v_norm[CC_HOG_CHANS] + 1e-5); v2 = v2 / (v_norm[CC_HOG_CHANS] + 1e-5); v3 = v3 / (v_norm[CC_HOG_CHANS] + 1e-5); #if CC_HOG_CHANS == 4 cv::Mat result = (cv::Mat_<float>(1,CC_HOG_CHANS*3) << v1[0], v1[1], v1[2], v1[3], v2[0], v2[1], v2[2], v2[3], v3[0], v3[1], v3[2], v3[3]); #else cv::Mat result = (cv::Mat_<float>(1,CC_HOG_CHANS*3) << v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6], v1[7], v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7]); #endif cv::sqrt(result, result); return result; }
SIMD_INLINE void PartialSort5(v128_u8 a[5]) { SortU8(a[2], a[3]); SortU8(a[1], a[2]); SortU8(a[2], a[3]); a[4] = vec_max(a[1], a[4]); a[0] = vec_min(a[0], a[3]); SortU8(a[2], a[0]); a[2] = vec_max(a[4], a[2]); a[2] = vec_min(a[2], a[0]); }
void DP_alpha_prey_w(PARAM *param, PRIOR *prior, DATA *data, const gsl_rng *r, int pid) { int i,j,id; float cur_alpha_prey, tmp_lambda, maxl, tmp; float prob[_MAX_COMP_]; for(i=0;i<_MAX_COMP_;i++) prob[i] = log(prior->gamma_alpha_prey[i]); cur_alpha_prey = param->alpha_prey[pid]; for(i=0;i<_MAX_COMP_;i++) { for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; if(param->Z[data->a2u[id]]) { tmp_lambda = param->lambda_true[id] + prior->theta_alpha_prey[i] - cur_alpha_prey; tmp = GSL_MIN(50.0, data->d[id]); if(tmp > 0.0) prob[i] += log_poisson_g_prop(tmp, exp(tmp_lambda), param->eta[pid]); } } } maxl = vec_max(prob, _MAX_COMP_); for(i=0;i<_MAX_COMP_;i++) prob[i] -= maxl; for(i=0;i<_MAX_COMP_;i++) prob[i] = exp(prob[i]); prior->w_alpha_prey[pid] = ranMultinom(r, prob, _MAX_COMP_); param->alpha_prey[pid] = prior->theta_alpha_prey[prior->w_alpha_prey[pid]]; for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; param->lambda_true[id] += param->alpha_prey[pid] - cur_alpha_prey; } }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t p2, register vec_u8_t q0, register vec_u8_t tc0) { register vec_u8_t average = vec_avg(p0, q0); register vec_u8_t temp; register vec_u8_t uncliped; register vec_u8_t ones; register vec_u8_t max; register vec_u8_t min; register vec_u8_t newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; }
void DP_eta_w(PARAM *param, PRIOR *prior, DATA *data, const gsl_rng *r, int pid) { int i,j,id; float cur_eta, tmp_lambda, maxl, tmp; float prob[_MAX_COMP_]; for(i=0;i<_MAX_COMP_;i++) prob[i] = log(prior->gamma_eta[i]); cur_eta = param->eta[pid]; for(i=0;i<_MAX_COMP_;i++) { for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; if(param->Z[data->a2u[id]] && data->d[id] > 0.0) { tmp_lambda = param->lambda_true[id]; //else tmp_lambda = param->lambda_false[id]; if(lowMode) tmp = GSL_MIN(_LM_, data->d[id]); else tmp = data->d[id]; if(data->d[id] > 0.0) prob[i] += log_poisson_g_prop(tmp, exp(tmp_lambda), prior->theta_eta[i]); } } } maxl = vec_max(prob, _MAX_COMP_); for(i=0;i<_MAX_COMP_;i++) prob[i] -= maxl; for(i=0;i<_MAX_COMP_;i++) prob[i] = exp(prob[i]); prior->w_eta[pid] = ranMultinom(r, prob, _MAX_COMP_); param->eta[pid] = prior->theta_eta[prior->w_eta[pid]]; }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0, register vector unsigned char p1, register vector unsigned char p2, register vector unsigned char q0, register vector unsigned char tc0) { register vector unsigned char average = vec_avg(p0, q0); register vector unsigned char temp; register vector unsigned char uncliped; register vector unsigned char ones; register vector unsigned char max; register vector unsigned char min; register vector unsigned char newp1; temp = vec_xor(average, p2); average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ ones = vec_splat_u8(1); temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ max = vec_adds(p1, tc0); min = vec_subs(p1, tc0); newp1 = vec_max(min, uncliped); newp1 = vec_min(max, newp1); return newp1; }
void DP_mu_w(PARAM *param, PRIOR *prior, DATA *data, const gsl_rng *r, int pid) { int i,j,id; float cur_mu, tmp_lambda, maxl, tmp; float prob[_MAX_COMP_]; for(i=0;i<_MAX_COMP_;i++) prob[i] = log(prior->gamma_mu[i]); cur_mu = param->mu[pid]; for(i=0;i<_MAX_COMP_;i++) { for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; if(data->ctrl[data->i2IP[id]]) { tmp_lambda = param->lambda_false[id] + prior->theta_mu[i] - cur_mu; tmp = data->d[id]; prob[i] += log_poisson_g_prop(tmp, exp(tmp_lambda), param->eta0[data->i2p[id]]); } } } maxl = vec_max(prob, _MAX_COMP_); for(i=0;i<_MAX_COMP_;i++) prob[i] -= maxl; for(i=0;i<_MAX_COMP_;i++) prob[i] = exp(prob[i]); prior->w_mu[pid] = ranMultinom(r, prob, _MAX_COMP_); param->mu[pid] = prior->theta_mu[prior->w_mu[pid]]; for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; param->lambda_false[id] += param->mu[pid] - cur_mu; } }
SIMD_INLINE void PartialSort9(v128_u8 a[9]) { SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); SortU8(a[0], a[1]); SortU8(a[3], a[4]); SortU8(a[6], a[7]); SortU8(a[1], a[2]); SortU8(a[4], a[5]); SortU8(a[7], a[8]); a[3] = vec_max(a[0], a[3]); a[5] = vec_min(a[5], a[8]); SortU8(a[4], a[7]); a[6] = vec_max(a[3], a[6]); a[4] = vec_max(a[1], a[4]); a[2] = vec_min(a[2], a[5]); a[4] = vec_min(a[4], a[7]); SortU8(a[4], a[2]); a[4] = vec_max(a[6], a[4]); a[4] = vec_min(a[4], a[2]); }
void DP_alpha_IP_w(PARAM *param, PRIOR *prior, DATA *data, const gsl_rng *r, int pid) { int i,j,id; float cur_alpha_IP, tmp_lambda, maxl; float prob[_MAX_COMP_]; for(i=0;i<_MAX_COMP_;i++) prob[i] = log(prior->gamma_alpha_IP[i]); cur_alpha_IP = param->alpha_IP[pid]; for(i=0;i<_MAX_COMP_;i++) { for(j=0;j<data->IPNinter[pid];j++) { id = data->IP2i[pid][j]; if(param->Z[data->a2u[id]]) { tmp_lambda = param->lambda_true[id] + prior->theta_alpha_IP[i] - cur_alpha_IP; prob[i] += log_gaussian(data->d[id], (tmp_lambda), param->eta[data->i2p[id]]); } } } maxl = vec_max(prob, _MAX_COMP_); for(i=0;i<_MAX_COMP_;i++) prob[i] -= maxl; for(i=0;i<_MAX_COMP_;i++) prob[i] = exp(prob[i]); prior->w_alpha_IP[pid] = ranMultinom(r, prob, _MAX_COMP_); param->alpha_IP[pid] = prior->theta_alpha_IP[prior->w_alpha_IP[pid]]; for(j=0;j<data->IPNinter[pid];j++) { id = data->IP2i[pid][j]; param->lambda_true[id] += param->alpha_IP[pid] - cur_alpha_IP; } }
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one iteration becomes pix2 in the next iteration. We can use this fact to avoid a potentially expensive unaligned read, each time around the loop. Read unaligned pixels into our vectors. The vectors are as follows: pix2v: pix2[0]-pix2[15] Split the pixel vectors into shorts */ tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix3v: pix3[0]-pix3[15] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix3[0]; pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix3v); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2v = pix3v; pix3 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
void gimp_composite_lighten_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); d=vec_min(alpha_a, alpha_b); a=vec_andc(a, alphamask); a=vec_adds(a, d); b=vec_andc(b, alphamask); d=vec_max(a, b); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a,alphamask); alpha_b=vec_and(b,alphamask); d=vec_min(alpha_a,alpha_b); a=vec_andc(a,alphamask); a=vec_adds(a,d); b=vec_andc(b,alphamask); d=vec_max(a, b); StoreUnalignedLess(d, D, length); }
double log_sum(double *x, int len) { int i; double res, sum; sum = vec_max(x, len); for(i=0;i<len;i++) x[i] -= sum; for(i=0;i<len;i++) x[i] = exp(x[i]); res = vec_sum(x, len); return res; }
__SIMDd _SIMD_max_pd(__SIMDd a, __SIMDd b) { #ifdef USE_SSE return _mm_max_pd(a,b); #elif defined USE_AVX return _mm256_max_pd(a,b); #elif defined USE_IBM return vec_max(a,b); #endif }
folgen_vektor_p folgen_vektor_add(folgen_vektor_p f, folgen_vektor_p g) { folgen_vektor_p back; int k, d, size, dim, a, b, test_f, test_g; vec_p max, vec_1, n, n_f, n_g, r; ASSERT( f->grad->dim == g->grad->dim ); dim = f->grad->dim; max = vec_max( f->grad, g->grad ); vec_1 = vec_one( dim ); n = vec_add( vec_1, max ); n_f = vec_add( vec_1, f->grad ); n_g = vec_add( vec_1, g->grad ); size = vec_size( n ); back = folgen_vektor_new( max ); for(k=0;k<size;k++) { r = entry_one2d( k, n ); test_f = 0; test_g = 0; for(d=0;d<dim;d++) { if( r->array[d] > f->grad->array[d] ) { test_f = test_f + 1; } if( r->array[d] > g->grad->array[d] ) { test_g = test_g + 1; } } if( (test_f == 0) && (test_g == 0) ) { a = entry_d2one( r, n_f ); b = entry_d2one( r, n_g ); folge_del( back->vektor[k] ); back->vektor[k] = folge_add( f->vektor[a], g->vektor[b] ); } if( (test_f != 0) && (test_g == 0) ) { b = entry_d2one( r, n_g ); folge_del( back->vektor[k] ); back->vektor[k] = folge_copy( g->vektor[b] ); } if( (test_f == 0) && (test_g != 0) ) { a = entry_d2one( r, n_f ); folge_del( back->vektor[k] ); back->vektor[k] = folge_copy( f->vektor[a] ); } vec_del( r ); } vec_del( n ); vec_del( n_f ); vec_del( n_g ); vec_del( vec_1 ); return back; }
/* Assumes input points to the beginning of the SSS symbol. The SSS symbol start is * given by SSS_SYMBOL_ST() macro in sss.h. * Estimates the m0 and m1 values and saves in m0_value and m1_value * the resulted correlation (higher is more likely) * */ void sss_synch_m0m1(sss_synch_t *q, cf_t *input, int *m0, float *m0_value, int *m1, float *m1_value) { /* This is aprox 3-4 kbytes of stack. Consider moving to sss_synch_t?? */ cf_t zdelay[N_SSS+1],zconj[N_SSS+1],zprod[N_SSS+1]; cf_t y[2][N_SSS+1], z[N_SSS+1], tmp[N_SSS+1]; float tmp_real[N_SSS+1]; cf_t input_fft[SSS_DFT_LEN]; int i; dft_run_c2c(&q->dftp_input, input, input_fft); for (i = 0; i < N_SSS; i++) { y[0][i] = input_fft[SSS_POS_SYMBOL + 2 * i]; y[1][i] = input_fft[SSS_POS_SYMBOL + 2 * i + 1]; } vec_dot_prod(y[0], q->fc_tables.c[0], z, N_SSS); memcpy(zdelay, &z[1], (N_SSS - 1) * sizeof(cf_t)); vec_conj(z, zconj, N_SSS - 1); vec_dot_prod(zdelay, zconj, zprod, N_SSS - 1); corr_all_zs(zprod, q->fc_tables.s, tmp); vec_abs(tmp, tmp_real, N_SSS); vec_max(tmp_real, m0_value, m0, N_SSS); vec_dot_prod(y[1], q->fc_tables.c[1], tmp, N_SSS); vec_dot_prod(tmp, q->fc_tables.z1[*m0], z, N_SSS); memcpy(zdelay, &z[1], (N_SSS - 1) * sizeof(cf_t)); vec_conj(z, zconj, N_SSS - 1); vec_dot_prod(zdelay, zconj, zprod, N_SSS - 1); corr_all_zs(zprod, q->fc_tables.s, tmp); vec_abs(tmp, tmp_real, N_SSS); vec_max(tmp_real, m1_value, m1, N_SSS); }
/** * Sum of Squared Errors for a 8x8 block. * AltiVec-enhanced. * It's the pix_abs8x8_altivec code above w/ squaring added. */ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; vector signed int sumsqr; sum = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Since we want to use unsigned chars, we can take advantage of the fact that abs(a-b)^2 = (a-b)^2. */ /* Calculate abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Square the values and add them to our sum */ sum = vec_msum(t5, t5, sum); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); sumsqr = vec_splat(sumsqr, 3); vec_ste(sumsqr, 0, &s); return s; }
/*! <em>Pointwise Maximum</em> of vA and vB. */ inline vec4i vec4i_max(vec4i vA, vec4i vB) { #if defined(__ALTIVEC__) /* AltiVec */ return vec_max((vector int)vA, (vector int)vB); #else /* Scalar */ int * s1 = (int*)&vA; int * s2 = (int*)&vB; vec4i max; int * smax = (int*)&max; smax[0] = MAX(s1[0], s2[0]); smax[1] = MAX(s1[1], s2[1]); smax[2] = MAX(s1[2], s2[2]); smax[3] = MAX(s1[3], s2[3]); return max; #endif }
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ tv = (vector unsigned char *) pix1; pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); tv = (vector unsigned char *) &pix2[0]; pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); tv = (vector unsigned char *) &pix2[1]; pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 Since we're reading 16 pixels, and actually only want 8, mask out the last 8 pixels. The 0s don't change the sum. */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char perm1 = vec_lvsl(0, pix2); vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); vector unsigned char pix2l, pix2r; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < h; i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: pix1v: pix1[0]-pix1[15] pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] */ pix1v = vec_ld( 0, pix1); pix2l = vec_ld( 0, pix2); pix2r = vec_ld(16, pix2); pix2v = vec_perm(pix2l, pix2r, perm1); pix2iv = vec_perm(pix2l, pix2r, perm2); /* Calculate the average vector */ avgv = vec_avg(pix2v, pix2iv); /* Calculate a sum of abs differences vector */ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
folge_p folge_add(folge_p f, folge_p g) { folge_p back; int size, k, size_g, size_f; fepc_real_t x, y; vec_p temp1, temp2, max, lang, min, r; size_f = vec_size( f->lang ); size_g = vec_size( g->lang ); if(size_f == 0) { back = folge_copy( g ); return back; } if(size_g == 0) { back = folge_copy( f ); return back; } if( (size_g!=0) && (size_f!=0) ) { min = vec_min( f->start, g->start ); temp1 = vec_add( f->start, f->lang ); temp2 = vec_add( g->start, g->lang ); max = vec_max( temp1, temp2 ); vec_del( temp1 ); vec_del( temp2 ); lang = vec_op( 1, max, -1, min); vec_del( max ); back = folge_new( min, lang ); size = vec_size( lang ); for(k=0;k<size;k++) { temp1 = entry_one2d( k, lang ); r = vec_add( min, temp1 ); vec_del( temp1 ); x = folge_glied( r, f ); y = folge_glied( r, g ); vec_del( r ); back->glied[k] = x + y; } return back; } }
static unsigned reg_sad_altivec(const kvz_pixel * const data1, const kvz_pixel * const data2, const int width, const int height, const unsigned stride1, const unsigned stride2) { vector unsigned int vsad = {0,0,0,0}, vzero = {0,0,0,0}; vector signed int sumdiffs; int tmpsad, sad = 0; int y, x; for (y = 0; y < height; ++y) { vector unsigned char perm1, perm2; perm1 = vec_lvsl(0, &data1[y * stride1]); perm2 = vec_lvsl(0, &data2[y * stride2]); for (x = 0; x <= width-16; x+=16) { vector unsigned char t1, t2, t3, t4, t5; vector unsigned char *current, *previous; current = (vector unsigned char *) &data1[y * stride1 + x]; previous = (vector unsigned char *) &data2[y * stride2 + x]; t1 = vec_perm(current[0], current[1], perm1 ); /* align current vector */ t2 = vec_perm(previous[0], previous[1], perm2 );/* align previous vector */ t3 = vec_max(t1, t2 ); /* find largest of two */ t4 = vec_min(t1, t2 ); /* find smaller of two */ t5 = vec_sub(t3, t4); /* find absolute difference */ vsad = vec_sum4s(t5, vsad); /* accumulate sum of differences */ } for (; x < width; ++x) { sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]); } } sumdiffs = vec_sums((vector signed int) vsad, (vector signed int) vzero); /* copy vector sum into unaligned result */ sumdiffs = vec_splat( sumdiffs, 3); vec_ste( sumdiffs, 0, &tmpsad ); sad += tmpsad; return sad; }
void DP_eta_w(PARAM *param, PRIOR *prior, DATA *data, const gsl_rng *r, int pid) { int i,j,id; float cur_eta, tmp_lambda, maxl; float prob[_MAX_COMP_]; for(i=0;i<_MAX_COMP_;i++) prob[i] = log(prior->gamma_eta[i]); cur_eta = param->eta[pid]; for(i=0;i<_MAX_COMP_;i++) { for(j=0;j<data->preyNinter[pid];j++) { id = data->p2i[pid][j]; if(param->Z[data->a2u[id]]) tmp_lambda = param->lambda_true[id]; else tmp_lambda = param->lambda_false[id]; prob[i] += log_gaussian(data->d[id], (tmp_lambda), prior->theta_eta[i]); } } maxl = vec_max(prob, _MAX_COMP_); for(i=0;i<_MAX_COMP_;i++) prob[i] -= maxl; for(i=0;i<_MAX_COMP_;i++) prob[i] = exp(prob[i]); prior->w_eta[pid] = ranMultinom(r, prob, _MAX_COMP_); param->eta[pid] = prior->theta_eta[prior->w_eta[pid]]; }
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { int i; int s __attribute__((aligned(16))); const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read potentially unaligned pixels into t1 and t2 */ perm1 = vec_lvsl(0, pix1); pix1v = (vector unsigned char *) pix1; perm2 = vec_lvsl(0, pix2); pix2v = (vector unsigned char *) pix2; t1 = vec_perm(pix1v[0], pix1v[1], perm1); t2 = vec_perm(pix2v[0], pix2v[1], perm2); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; int s; const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm = vec_lvsl(0, pix2); vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; vector signed int sumdiffs; sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < h; i++) { /* Read potentially unaligned pixels into t1 and t2 */ vector unsigned char pix2l = vec_ld( 0, pix2); vector unsigned char pix2r = vec_ld(15, pix2); t1 = vec_ld(0, pix1); t2 = vec_perm(pix2l, pix2r, perm); /* Calculate a sum of abs differences vector */ t3 = vec_max(t1, t2); t4 = vec_min(t1, t2); t5 = vec_sub(t3, t4); /* Add each 4 pixel group together and put 4 results into sad */ sad = vec_sum4s(t5, sad); pix1 += line_size; pix2 += line_size; } /* Sum up the four partial sums, and put the result into s */ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); sumdiffs = vec_splat(sumdiffs, 3); vec_ste(sumdiffs, 0, &s); return s; }
fepc_real_t folge_norm(folge_p f, folge_p g) { vec_p temp, temp1, temp2, min, max, lang, vec_1; int k, dim, size; fepc_real_t norm, diff; ASSERT(f->start->dim == g->start->dim); dim = f->start->dim; vec_1 = vec_one( dim ); min = vec_min( f->start, g->start ); temp1 = vec_add( f->start, f->lang ); temp2 = vec_add( g->start, g->lang ); temp = vec_max( temp1, temp2 ); vec_del( temp1 ); vec_del( temp2 ); lang = vec_op( 1, temp, -1, min ); vec_del( temp ); size = vec_size( lang ); norm = 0.0; for(k=0;k<size;k++) { temp = entry_one2d( k, lang ); temp1 = vec_add( temp, min ); diff = folge_glied( temp1, f ) - folge_glied( temp1, g ); norm = norm + ( diff * diff ); vec_del( temp ); vec_del( temp1 ); } vec_del( vec_1 ); vec_del( min ); vec_del( lang ); norm = sqrt(norm); return norm; }
template<> SIMD_INLINE v128_s16 InterferenceChange<false>(v128_s16 statistic, v128_s16 value, v128_s16 saturation) { return vec_max(vec_sub(statistic, value), saturation); }
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> }
/* Function: p7_ViterbiFilter() * Synopsis: Calculates Viterbi score, vewy vewy fast, in limited precision. * Incept: SRE, Tue Nov 27 09:15:24 2007 [Janelia] * * Purpose: Calculates an approximation of the Viterbi score for sequence * <dsq> of length <L> residues, using optimized profile <om>, * and a preallocated one-row DP matrix <ox>. Return the * estimated Viterbi score (in nats) in <ret_sc>. * * Score may overflow (and will, on high-scoring * sequences), but will not underflow. * * The model must be in a local alignment mode; other modes * cannot provide the necessary guarantee of no underflow. * * This is a striped SIMD Viterbi implementation using Intel * VMX integer intrinsics \citep{Farrar07}, in reduced * precision (signed words, 16 bits). * * Args: dsq - digital target sequence, 1..L * L - length of dsq in residues * om - optimized profile * ox - DP matrix * ret_sc - RETURN: Viterbi score (in nats) * * Returns: <eslOK> on success; * <eslERANGE> if the score overflows; in this case * <*ret_sc> is <eslINFINITY>, and the sequence can * be treated as a high-scoring hit. * * Throws: <eslEINVAL> if <ox> allocation is too small, or if * profile isn't in a local alignment mode. (Must be in local * alignment mode because that's what helps us guarantee * limited dynamic range.) * * Xref: [Farrar07] for ideas behind striped SIMD DP. * J2/46-47 for layout of HMMER's striped SIMD DP. * J2/50 for single row DP. * J2/60 for reduced precision (epu8) * J2/65 for initial benchmarking * J2/66 for precision maximization * J4/138-140 for reimplementation in 16-bit precision */ int p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc) { vector signed short mpv, dpv, ipv; /* previous row values */ vector signed short sv; /* temp storage of 1 curr row value in progress */ vector signed short dcv; /* delayed storage of D(i,q+1) */ vector signed short xEv; /* E state: keeps max for Mk->E as we go */ vector signed short xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector signed short Dmaxv; /* keeps track of maximum D cell on row */ int16_t xE, xB, xC, xJ, xN; /* special states' scores */ int16_t Dmax; /* maximum D cell score on row */ int i; /* counter over sequence positions 1..L */ int q; /* counter over vectors 0..nq-1 */ int Q; /* segment length: # of vectors */ vector signed short *dp; /* using {MDI}MX(q) macro requires initialization of <dp> */ vector signed short *rsc; /* will point at om->ru[x] for residue x[i] */ vector signed short *tsc; /* will point into (and step thru) om->tu */ vector signed short negInfv; Q = p7O_NQW(om->M); dp = ox->dpw[0]; /* Check that the DP matrix is ok for us. */ if (Q > ox->allocQ8) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small"); if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment"); ox->M = om->M; negInfv = esl_vmx_set_s16((signed short)-32768); /* Initialization. In unsigned arithmetic, -infinity is -32768 */ for (q = 0; q < Q; q++) MMXo(q) = IMXo(q) = DMXo(q) = negInfv; xN = om->base_w; xB = xN + om->xw[p7O_N][p7O_MOVE]; xJ = -32768; xC = -32768; xE = -32768; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */ #endif for (i = 1; i <= L; i++) { rsc = om->rwv[dsq[i]]; tsc = om->twv; dcv = negInfv; /* "-infinity" */ xEv = negInfv; Dmaxv = negInfv; xBv = esl_vmx_set_s16(xB); /* Right shifts by 1 value (2 bytes). 4,8,12,x becomes x,4,8,12. * Because ia32 is littlendian, this means a left bit shift. * Zeros shift on automatically; replace it with -32768. */ mpv = MMXo(Q-1); mpv = vec_sld(negInfv, mpv, 14); dpv = DMXo(Q-1); dpv = vec_sld(negInfv, dpv, 14); ipv = IMXo(Q-1); ipv = vec_sld(negInfv, ipv, 14); for (q = 0; q < Q; q++) { /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */ sv = vec_adds(xBv, *tsc); tsc++; sv = vec_max (sv, vec_adds(mpv, *tsc)); tsc++; sv = vec_max (sv, vec_adds(ipv, *tsc)); tsc++; sv = vec_max (sv, vec_adds(dpv, *tsc)); tsc++; sv = vec_adds(sv, *rsc); rsc++; xEv = vec_max(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMXo(q); dpv = DMXo(q); ipv = IMXo(q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMXo(q) = sv; DMXo(q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = vec_adds(sv, *tsc); tsc++; Dmaxv = vec_max(dcv, Dmaxv); /* Calculate and store I(i,q) */ sv = vec_adds(mpv, *tsc); tsc++; IMXo(q)= vec_max(sv, vec_adds(ipv, *tsc)); tsc++; } /* Now the "special" states, which start from Mk->E (->C, ->J->B) */ xE = esl_vmx_hmax_s16(xEv); if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; } /* immediately detect overflow */ xN = xN + om->xw[p7O_N][p7O_LOOP]; xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]); xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]); xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Finally the "lazy F" loop (sensu [Farrar07]). We can often * prove that we don't need to evaluate any D->D paths at all. * * The observation is that if we can show that on the next row, * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths * for all k, then we don't need any D->D calculations. * * The test condition is: * max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i) * So: * max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound; * max_k D(i,k) is why we tracked Dmaxv; * xB(i) was just calculated above. */ Dmax = esl_vmx_hmax_s16(Dmaxv); if (Dmax + om->ddbound_w > xB) { /* Now we're obligated to do at least one complete DD path to be sure. */ /* dcv has carried through from end of q loop above */ dcv = vec_sld(negInfv, dcv, 14); tsc = om->twv + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_adds(DMXo(q), *tsc); tsc++; } /* We may have to do up to three more passes; the check * is for whether crossing a segment boundary can improve * our score. */ do { dcv = vec_sld(negInfv, dcv, 14); tsc = om->twv + 7*Q; /* set tsc to start of the DD's */ for (q = 0; q < Q; q++) { if (! vec_any_gt(dcv, DMXo(q))) break; DMXo(q) = vec_max(dcv, DMXo(q)); dcv = vec_adds(DMXo(q), *tsc); tsc++; } } while (q == Q); } else /* not calculating DD? then just store the last M->D vector calc'ed.*/ DMXo(0) = vec_sld(negInfv, dcv, 14); #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC); #endif } /* end loop over sequence residues 1..L */ /* finally C->T */ if (xC > -32768) { *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w; /* *ret_sc += L * om->ncj_roundoff; see J4/150 for rationale: superceded by -3.0nat approximation*/ *ret_sc /= om->scale_w; *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */ } else *ret_sc = -eslINFINITY; return eslOK; }
static int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) { int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; const vector float zero = (const vector float)FOUROF(0.); // used after quantize step int oldBaseValue = 0; // Load the data into the row/alt vectors { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_ld(0, data); data1 = vec_ld(16, data); data2 = vec_ld(32, data); data3 = vec_ld(48, data); data4 = vec_ld(64, data); data5 = vec_ld(80, data); data6 = vec_ld(96, data); data7 = vec_ld(112, data); // Transpose the data before we start TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); // load the data into floating point vectors. We load // the high half of each row into the main row vectors // and the low half into the alt vectors. row0 = vec_ctf(vec_unpackh(data0), 0); alt0 = vec_ctf(vec_unpackl(data0), 0); row1 = vec_ctf(vec_unpackh(data1), 0); alt1 = vec_ctf(vec_unpackl(data1), 0); row2 = vec_ctf(vec_unpackh(data2), 0); alt2 = vec_ctf(vec_unpackl(data2), 0); row3 = vec_ctf(vec_unpackh(data3), 0); alt3 = vec_ctf(vec_unpackl(data3), 0); row4 = vec_ctf(vec_unpackh(data4), 0); alt4 = vec_ctf(vec_unpackl(data4), 0); row5 = vec_ctf(vec_unpackh(data5), 0); alt5 = vec_ctf(vec_unpackl(data5), 0); row6 = vec_ctf(vec_unpackh(data6), 0); alt6 = vec_ctf(vec_unpackl(data6), 0); row7 = vec_ctf(vec_unpackh(data7), 0); alt7 = vec_ctf(vec_unpackl(data7), 0); } // The following block could exist as a separate an altivec dct // function. However, if we put it inline, the DCT data can remain // in the vector local variables, as floats, which we'll use during the // quantize step... { const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); int whichPass, whichHalf; for(whichPass = 1; whichPass<=2; whichPass++) { for(whichHalf = 1; whichHalf<=2; whichHalf++) { vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; vector float tmp10, tmp11, tmp12, tmp13; vector float z1, z2, z3, z4, z5; tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); row0 = vec_add(tmp10, tmp11); // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); row4 = vec_sub(tmp10, tmp11); // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), // CONST_BITS-PASS1_BITS); row2 = vec_madd(tmp13, vec_0_765366865, z1); // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), // CONST_BITS-PASS1_BITS); row6 = vec_madd(tmp12, vec_1_847759065, z1); z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ z3 = vec_madd(z3, vec_1_961570560, z5); // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ z4 = vec_madd(z4, vec_0_390180644, z5); // The following adds are rolled into the multiplies above // z3 = vec_add(z3, z5); // z3 += z5; // z4 = vec_add(z4, z5); // z4 += z5; // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ // Wow! It's actually more efficient to roll this multiply // into the adds below, even thought the multiply gets done twice! // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ // Same with this one... // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); // Swap the row values with the alts. If this is the first half, // this sets up the low values to be acted on in the second half. // If this is the second half, it puts the high values back in // the row values where they are expected to be when we're done. SWAP(row0, alt0); SWAP(row1, alt1); SWAP(row2, alt2); SWAP(row3, alt3); SWAP(row4, alt4); SWAP(row5, alt5); SWAP(row6, alt6); SWAP(row7, alt7); } if (whichPass == 1) { // transpose the data for the second pass // First, block transpose the upper right with lower left. SWAP(row4, alt0); SWAP(row5, alt1); SWAP(row6, alt2); SWAP(row7, alt3); // Now, transpose each block of four TRANSPOSE4(row0, row1, row2, row3); TRANSPOSE4(row4, row5, row6, row7); TRANSPOSE4(alt0, alt1, alt2, alt3); TRANSPOSE4(alt4, alt5, alt6, alt7); } } } // perform the quantize step, using the floating point data // still in the row/alt registers { const int* biasAddr; const vector signed int* qmat; vector float bias, negBias; if (s->mb_intra) { vector signed int baseVector; // We must cache element 0 in the intra case // (it needs special handling). baseVector = vec_cts(vec_splat(row0, 0), 0); vec_ste(baseVector, 0, &oldBaseValue); qmat = (vector signed int*)s->q_intra_matrix[qscale]; biasAddr = &(s->intra_quant_bias); } else { qmat = (vector signed int*)s->q_inter_matrix[qscale]; biasAddr = &(s->inter_quant_bias); } // Load the bias vector (We add 0.5 to the bias so that we're // rounding when we convert to int, instead of flooring.) { vector signed int biasInt; const vector float negOneFloat = (vector float)FOUROF(-1.0f); LOAD4(biasInt, biasAddr); bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); negBias = vec_madd(bias, negOneFloat, zero); } { vector float q0, q1, q2, q3, q4, q5, q6, q7; q0 = vec_ctf(qmat[0], QMAT_SHIFT); q1 = vec_ctf(qmat[2], QMAT_SHIFT); q2 = vec_ctf(qmat[4], QMAT_SHIFT); q3 = vec_ctf(qmat[6], QMAT_SHIFT); q4 = vec_ctf(qmat[8], QMAT_SHIFT); q5 = vec_ctf(qmat[10], QMAT_SHIFT); q6 = vec_ctf(qmat[12], QMAT_SHIFT); q7 = vec_ctf(qmat[14], QMAT_SHIFT); row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), vec_cmpgt(row0, zero)); row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), vec_cmpgt(row1, zero)); row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), vec_cmpgt(row2, zero)); row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), vec_cmpgt(row3, zero)); row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), vec_cmpgt(row4, zero)); row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), vec_cmpgt(row5, zero)); row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), vec_cmpgt(row6, zero)); row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), vec_cmpgt(row7, zero)); q0 = vec_ctf(qmat[1], QMAT_SHIFT); q1 = vec_ctf(qmat[3], QMAT_SHIFT); q2 = vec_ctf(qmat[5], QMAT_SHIFT); q3 = vec_ctf(qmat[7], QMAT_SHIFT); q4 = vec_ctf(qmat[9], QMAT_SHIFT); q5 = vec_ctf(qmat[11], QMAT_SHIFT); q6 = vec_ctf(qmat[13], QMAT_SHIFT); q7 = vec_ctf(qmat[15], QMAT_SHIFT); alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), vec_cmpgt(alt0, zero)); alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), vec_cmpgt(alt1, zero)); alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), vec_cmpgt(alt2, zero)); alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), vec_cmpgt(alt3, zero)); alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), vec_cmpgt(alt4, zero)); alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), vec_cmpgt(alt5, zero)); alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), vec_cmpgt(alt6, zero)); alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), vec_cmpgt(alt7, zero)); } } // Store the data back into the original block { vector signed short data0, data1, data2, data3, data4, data5, data6, data7; data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); { // Clamp for overflow vector signed int max_q_int, min_q_int; vector signed short max_q, min_q; LOAD4(max_q_int, &(s->max_qcoeff)); LOAD4(min_q_int, &(s->min_qcoeff)); max_q = vec_pack(max_q_int, max_q_int); min_q = vec_pack(min_q_int, min_q_int); data0 = vec_max(vec_min(data0, max_q), min_q); data1 = vec_max(vec_min(data1, max_q), min_q); data2 = vec_max(vec_min(data2, max_q), min_q); data4 = vec_max(vec_min(data4, max_q), min_q); data5 = vec_max(vec_min(data5, max_q), min_q); data6 = vec_max(vec_min(data6, max_q), min_q); data7 = vec_max(vec_min(data7, max_q), min_q); } { vector bool char zero_01, zero_23, zero_45, zero_67; vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67; vector signed char negOne = vec_splat_s8(-1); vector signed char* scanPtr = (vector signed char*)(s->intra_scantable.inverse); signed char lastNonZeroChar; // Determine the largest non-zero index. zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), vec_cmpeq(data1, (vector signed short)zero)); zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), vec_cmpeq(data3, (vector signed short)zero)); zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), vec_cmpeq(data5, (vector signed short)zero)); zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), vec_cmpeq(data7, (vector signed short)zero)); // 64 biggest values scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01); scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23); scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45); scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67); // 32 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23); scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67); // 16 largest values scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45); // 8 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 4 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // 2 largest values scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); // largest value scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne), vec_mergel(scanIndexes_01, negOne)); scanIndexes_01 = vec_splat(scanIndexes_01, 0); vec_ste(scanIndexes_01, 0, &lastNonZeroChar); lastNonZero = lastNonZeroChar; // While the data is still in vectors we check for the transpose IDCT permute // and handle it using the vector unit if we can. This is the permute used // by the altivec idct, so it is common when using the altivec dct. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) { TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); } vec_st(data0, 0, data); vec_st(data1, 16, data); vec_st(data2, 32, data); vec_st(data3, 48, data); vec_st(data4, 64, data); vec_st(data5, 80, data); vec_st(data6, 96, data); vec_st(data7, 112, data); } } // special handling of block[0] if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) oldBaseValue /= s->y_dc_scale; else oldBaseValue /= s->c_dc_scale; } // Divide by 8, rounding the result data[0] = (oldBaseValue + 4) >> 3; } // We handled the transpose permutation above and we don't // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) { ff_block_permute(data, s->dsp.idct_permutation, s->intra_scantable.scantable, lastNonZero); } return lastNonZero; }