void check_special_values() { V4SF vx; vx.f[0] = -1000; vx.f[1] = -100; vx.f[2] = 100; vx.f[3] = 1000; printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n"); vx.f[0] = QNAN.f; vx.f[1] = PINF.f; vx.f[2] = MINF.f; vx.f[3] = QNAN2.f; printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n"); vx.f[0] = 0; vx.f[1] = -10; vx.f[2] = 1e30f; vx.f[3] = 1e-42f; printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n"); vx.f[0] = QNAN.f; vx.f[1] = PINF.f; vx.f[2] = MINF.f; vx.f[3] = QNAN2.f; printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n"); printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n"); printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n"); vx.f[0] = -1e30; vx.f[1] = -100000; vx.f[2] = 1e30; vx.f[3] = 100000; printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n"); printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n"); }
// SIMD exp __SIMD _SIMD_exp_ps(__SIMD a) { #ifdef USE_SSE return exp_ps(a); #elif defined USE_AVX return exp256_ps(a); #endif }
/* For each group of three elements, compute
 *   elts[i].re = f(elts[i].x, elts[i].y, elts[i].z)
 * via the exp-based formula in the loop below, three elements per
 * iteration using SSE lanes.
 *
 * FIX: the second accumulation used the undeclared name `sum1`; the
 * intended operand is `s1` (the partial sum declared one line above).
 *
 * NOTE(review): the commented-out scalar reference computes
 * pow(pow(x,a)+pow(y,a)+pow(z,a), 1/a), but the identity applied below
 * is exp(v * log(a)) = a^v, not v^a. Kept exactly as the author wrote
 * it to preserve behavior — confirm against the intended formula.
 *
 * Preconditions: num_elts % 3 == 0; &elts[i].x must satisfy the 16-byte
 * alignment required by _mm_load_ps.
 */
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);     /* broadcast log(a)   to all lanes */
    v4sf log_1_a = _mm_load1_ps(&logf_1_a); /* broadcast log(1/a) to all lanes */
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); // x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); // x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); // z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); // z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0

        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a);  // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a);  // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a);  // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a);            // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a);            // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a);            // pow(z1, a), ..., 0

        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(s1, ex2);     // FIX: was `sum1` (undeclared identifier)

        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}
int check_explog_precision(float xmin, float xmax) { unsigned nb_trials = 100000; printf("checking exp/log [%g, %g]\n", xmin, xmax); float max_err_exp_ref = 0, max_err_exp_cep = 0, max_err_exp_x = 0; float max_err_log_ref = 0, max_err_log_cep = 0, max_err_log_x = 0; float max_err_logexp_test = 0; float max_err_logexp_ref = 0; unsigned i; for (i=0; i < nb_trials; ++i) { V4SF vx, exp4, log4; vx.f[0] = frand()*(xmax-xmin)+xmin; vx.f[1] = frand()*(xmax-xmin)+xmin; vx.f[2] = frand()*(xmax-xmin)+xmin; vx.f[3] = frand()*(xmax-xmin)+xmin; exp4.v = exp_ps(vx.v); log4.v = log_ps(exp4.v); unsigned j; for (j=0; j < 4; ++j) { float x = vx.f[j]; float exp_test = exp4.f[j]; float log_test = log4.f[j]; float exp_ref = expf(x); float exp_cep = cephes_expf(x); float err_exp_ref = fabs(exp_ref - exp_test)/exp_ref; float err_exp_cep = fabs(exp_cep - exp_test)/exp_ref; if (err_exp_ref > max_err_exp_ref) { max_err_exp_ref = err_exp_ref; max_err_exp_x = x; } max_err_exp_cep = MAX(max_err_exp_cep, err_exp_cep); float log_ref = logf(exp_test); float log_cep = cephes_logf(exp_test); float err_log_ref = fabs(log_ref - log_test); float err_log_cep = fabs(log_cep - log_test); if (err_log_ref > max_err_log_ref) { max_err_log_ref = err_log_ref; max_err_log_x = x; } max_err_log_cep = MAX(max_err_log_cep, err_log_cep); float err_logexp_test = fabs(x - log_test); float err_logexp_ref = fabs(x - logf(expf(x))); max_err_logexp_ref = MAX(max_err_logexp_ref, err_logexp_ref); max_err_logexp_test = MAX(max_err_logexp_test, err_logexp_test); } } printf("max (relative) deviation from expf(x): %g at %14.12g, max deviation from cephes_expf(x): %g\n", max_err_exp_ref, max_err_exp_x, max_err_exp_cep); printf("max (absolute) deviation from logf(x): %g at %14.12g, max deviation from cephes_logf(x): %g\n", max_err_log_ref, max_err_log_x, max_err_log_cep); printf("deviation of x - log(exp(x)): %g (ref deviation is %g)\n", max_err_logexp_test, max_err_logexp_ref); if (max_err_logexp_test < 2e-7 && max_err_exp_ref < 
2e-7 && max_err_log_ref < 2e-7) { printf(" ->> precision OK for the exp_ps / log_ps <<-\n\n"); return 0; } else { printf("\n WRONG PRECISION !! there is a problem\n\n"); return 1; } }
/**
 * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
 *
 * Loads the 3-channel (RGB, stored as doubles) center pixel plus its four
 * neighbors, then combines three scalar factors into the returned weight:
 *   t2 = sqrt(mean squared deviation of the center channels from their mean)
 *   t4 = product of exp(-12.5*(channel-0.5)^2) over the center channels
 *   t7 = |(-4*greyc + grey1+grey2+grey3+grey4)|  (grey = dot with rgb0W)
 * and returns t2*t4*t7 + 1e-12.
 *
 * NOTE(review): the three factors resemble exposure-fusion quality measures
 * (contrast / well-exposedness / a grey Laplacian) — confirm against the caller.
 *
 * Assumptions (TODO confirm at call sites): im holds interleaved 3-double
 * pixels; mask1110 selects exactly the first three lanes so the masked
 * center load never reads a 4th value; the unaligned neighbor loads may read
 * one double past the pixel, which must stay inside the image buffer.
 * rgb0W must have a zero in lane 3 so the extra loaded value cannot leak
 * into the grey dot products. The COST_INC_* macros are instrumentation
 * counters only.
 */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110, const __m256d rgb0W, const __m256d onehalf,
        const __m256d minustwelvehalf){
    // double r = im[center];
    // double g = im[center+1];
    // double b = im[center+2];
    // double r1 = im[top];
    // double g1 = im[top+1];
    // double b1 = im[top+2];
    // double r2 = im[left];
    // double g2 = im[left+1];
    // double b2 = im[left+2];
    // double r3 = im[right];
    // double g3 = im[right+1];
    // double b3 = im[right+2];
    // double r4 = im[bottom];
    // double g4 = im[bottom+1];
    // double b4 = im[bottom+2];
    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);  // center: masked so lane 3 is 0
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));
    COST_INC_LOAD(20);
    // double grey = rw * r + gw * g + bw * b;
    // double grey1 = rw * r1 + gw * g1 + bw * b1;
    // double grey2 = rw * r2 + gw * g2 + bw * b2;
    // double grey3 = rw * r3 + gw * g3 + bw * b3;
    // double grey4 = rw * r4 + gw * g4 + bw * b4;
    // per-lane products; the horizontal sums below finish the dot products
    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);
    //AVX: double: horizontal add for 1 vector
    // c_hsum = r + g + b of the center pixel (lane 3 is 0 from the masked load)
    __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
    __m256d c_h = _mm256_hadd_pd(c,c_perm);
    __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
    __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
    double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
    double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
    double c_hsum = c_hsum_lo + c_hsum_hi;
    //AVX: double: horizontal add for 1 vector
    // greyc_hsum = grey value of the center pixel
    __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
    __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
    __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
    __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
    double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
    double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
    double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;
    //AVX: _m256d: horizontal add for 4 vectors at once
    // grey_1234 holds the four neighbor grey values, one per lane
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 = _mm256_add_pd(grey_1234_perm, grey_1234_blend);
    //AVX: double: horizontal add for 1 vector
    // grey1234_sum = grey1 + grey2 + grey3 + grey4
    __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
    __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
    __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
    __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
    double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
    double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
    double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;
    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX
    // mu = mean of the center pixel's three channels
    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);
    // double rmu = r-mu;
    // double gmu = g-mu;
    // double bmu = b-mu;
    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX
    // double rz = r-0.5;
    // double gz = g-0.5;
    // double bz = b-0.5;
    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX
    // double rzrz = rz*rz;
    // double gzgz = gz*gz;
    // double bzbz = bz*bz;
    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    // double re = exp(-12.5*rzrz);
    // double ge = exp(-12.5*gzgz);
    // double be = exp(-12.5*bzbz);
    // exp is evaluated in single precision via exp_ps, then widened back
    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);
    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);
    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    // double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);
    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);
    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;
    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);
    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    // t1 is already non-negative (sqrt result); fabs kept as written
    double t2 = fabs(t1);
    COST_INC_ABS(1);
    // double t3 = re*ge*be;
    // multiply the three exp terms: lanes 0,1 of the low half and lane 0 of the high half
    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi
    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);
    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;
    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);
    double t5 = t2 * t4;
    COST_INC_MUL(1);
    // double t6 = -4.0*grey+grey1+grey2+grey3+grey4;
    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;
    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX
    double t7 = fabs(t6);
    COST_INC_ABS(1);
    double t8 = t5 * t7;
    COST_INC_MUL(1);
    // epsilon keeps the returned weight strictly positive
    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);
    return t9;
}
inline __m128 _mm_pow_ps(__m128 a, __m128 b) { return exp_ps(_mm_mul_ps(b, log_ps(a))); }