コード例 #1
0
void check_special_values() {
  V4SF vx;
  vx.f[0] = -1000;
  vx.f[1] = -100;
  vx.f[2] = 100;
  vx.f[3] = 1000;
  printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n");
  vx.f[0] = QNAN.f;
  vx.f[1] = PINF.f;
  vx.f[2] = MINF.f;
  vx.f[3] = QNAN2.f;
  printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n");
  vx.f[0] = 0;
  vx.f[1] = -10;
  vx.f[2] = 1e30f;
  vx.f[3] = 1e-42f;
  printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n");
  vx.f[0] = QNAN.f;
  vx.f[1] = PINF.f;
  vx.f[2] = MINF.f;
  vx.f[3] = QNAN2.f;
  printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n");
  printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n");
  printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n");
  vx.f[0] = -1e30;
  vx.f[1] = -100000;
  vx.f[2] = 1e30;
  vx.f[3] = 100000;
  printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n");
  printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n");
}
コード例 #2
0
ファイル: rosesimd.c プロジェクト: peihunglin/TSVC_benchmark
// SIMD exp
__SIMD _SIMD_exp_ps(__SIMD a)
{
#ifdef  USE_SSE
  return exp_ps(a); 
#elif defined USE_AVX
  return exp256_ps(a); 
#endif
}
コード例 #3
0
ファイル: main.cpp プロジェクト: KubaO/stackoverflown
void fast(element_t * const elements, const int num_elts, const float a) {
    element_t * elts = elements;
    float logf_a = logf(a);
    float logf_1_a = logf(1.0/a);
    v4sf log_a = _mm_load1_ps(&logf_a);
    v4sf log_1_a = _mm_load1_ps(&logf_1_a);
    assert(num_elts % 3 == 0); // operates on 3 elements at a time

    // elts->re = powf((powf(elts->x, a) + powf(elts->y, a) + powf(elts->z, a)), 1.0/a);
    for (int i = 0; i < num_elts; i += 3) {
        // transpose
        // we save one operation over _MM_TRANSPOSE4_PS by skipping the last row of output
        v4sf r0 = _mm_load_ps(&elts[0].x); // x1,y1,z1,0
        v4sf r1 = _mm_load_ps(&elts[1].x); // x2,y2,z2,0
        v4sf r2 = _mm_load_ps(&elts[2].x); // x3,y3,z3,0
        v4sf r3 = _mm_setzero_ps();        // 0, 0, 0, 0
        v4sf t0 = _mm_unpacklo_ps(r0, r1); //  x1,x2,y1,y2
        v4sf t1 = _mm_unpacklo_ps(r2, r3); //  x3,0, y3,0
        v4sf t2 = _mm_unpackhi_ps(r0, r1); //  z1,z2,0, 0
        v4sf t3 = _mm_unpackhi_ps(r2, r3); //  z3,0, 0, 0
        r0 = _mm_movelh_ps(t0, t1);        // x1,x2,x3,0
        r1 = _mm_movehl_ps(t1, t0);        // y1,y2,y3,0
        r2 = _mm_movelh_ps(t2, t3);        // z1,z2,z3,0
        // perform pow(x,a),.. using the fact that pow(x,a) = exp(x * log(a))
        v4sf r0a = _mm_mul_ps(r0, log_a); // x1*log(a), x2*log(a), x3*log(a), 0
        v4sf r1a = _mm_mul_ps(r1, log_a); // y1*log(a), y2*log(a), y3*log(a), 0
        v4sf r2a = _mm_mul_ps(r2, log_a); // z1*log(a), z2*log(a), z3*log(a), 0
        v4sf ex0 = exp_ps(r0a); // pow(x1, a), ..., 0
        v4sf ex1 = exp_ps(r1a); // pow(y1, a), ..., 0
        v4sf ex2 = exp_ps(r2a); // pow(z1, a), ..., 0
        // sum
        v4sf s1 = _mm_add_ps(ex0, ex1);
        v4sf s2 = _mm_add_ps(sum1, ex2);
        // pow(sum, 1/a) = exp(sum * log(1/a))
        v4sf ps = _mm_mul_ps(s2, log_1_a);
        v4sf es = exp_ps(ps);
        ALIGN16_BEG float re[4] ALIGN16_END;
        _mm_store_ps(re, es);
        elts[0].re = re[0];
        elts[1].re = re[1];
        elts[2].re = re[2];
        elts += 3;
    }
}
コード例 #4
0
int check_explog_precision(float xmin, float xmax) {
  unsigned nb_trials = 100000;
  printf("checking exp/log [%g, %g]\n", xmin, xmax);

  float max_err_exp_ref = 0, max_err_exp_cep = 0, max_err_exp_x = 0;
  float max_err_log_ref = 0, max_err_log_cep = 0, max_err_log_x = 0;
  float max_err_logexp_test = 0;
  float max_err_logexp_ref = 0;
  unsigned i;
  for (i=0; i < nb_trials; ++i) {
    V4SF vx, exp4, log4;
    vx.f[0] = frand()*(xmax-xmin)+xmin;
    vx.f[1] = frand()*(xmax-xmin)+xmin;
    vx.f[2] = frand()*(xmax-xmin)+xmin;
    vx.f[3] = frand()*(xmax-xmin)+xmin;
    exp4.v = exp_ps(vx.v);
    log4.v = log_ps(exp4.v);
    unsigned j;
    for (j=0; j < 4; ++j) {
      float x = vx.f[j];
      float exp_test = exp4.f[j];
      float log_test = log4.f[j];
      float exp_ref = expf(x);
      float exp_cep = cephes_expf(x);
      float err_exp_ref = fabs(exp_ref - exp_test)/exp_ref;
      float err_exp_cep = fabs(exp_cep - exp_test)/exp_ref;
      if (err_exp_ref > max_err_exp_ref) {
        max_err_exp_ref = err_exp_ref;
        max_err_exp_x = x;
      }
      max_err_exp_cep = MAX(max_err_exp_cep, err_exp_cep);

      float log_ref = logf(exp_test);
      float log_cep = cephes_logf(exp_test);
      float err_log_ref = fabs(log_ref - log_test);
      float err_log_cep = fabs(log_cep - log_test);
      if (err_log_ref > max_err_log_ref) {
        max_err_log_ref = err_log_ref;
        max_err_log_x = x;
      }
      max_err_log_cep = MAX(max_err_log_cep, err_log_cep);
      float err_logexp_test = fabs(x - log_test);
      float err_logexp_ref = fabs(x - logf(expf(x)));
      max_err_logexp_ref = MAX(max_err_logexp_ref, err_logexp_ref);
      max_err_logexp_test = MAX(max_err_logexp_test, err_logexp_test);
    }
  }
  printf("max (relative) deviation from expf(x): %g at %14.12g, max deviation from cephes_expf(x): %g\n",
         max_err_exp_ref, max_err_exp_x, max_err_exp_cep);
  printf("max (absolute) deviation from logf(x): %g at %14.12g, max deviation from cephes_logf(x): %g\n",
         max_err_log_ref, max_err_log_x, max_err_log_cep);

  printf("deviation of x - log(exp(x)): %g (ref deviation is %g)\n",
         max_err_logexp_test, max_err_logexp_ref);

  if (max_err_logexp_test < 2e-7 && max_err_exp_ref < 2e-7 && max_err_log_ref < 2e-7) {
    printf("   ->> precision OK for the exp_ps / log_ps <<-\n\n");
    return 0;
  } else {
    printf("\n   WRONG PRECISION !! there is a problem\n\n");
    return 1;
  }
}
コード例 #5
0
/**
  * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
  */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];
//    double g = im[center+1];
//    double b = im[center+2];

//    double r1 = im[top];
//    double g1 = im[top+1];
//    double b1 = im[top+2];
//    double r2 = im[left];
//    double g2 = im[left+1];
//    double b2 = im[left+2];
//    double r3 = im[right];
//    double g3 = im[right+1];
//    double b3 = im[right+2];
//    double r4 = im[bottom];
//    double g4 = im[bottom+1];
//    double b4 = im[bottom+2];

    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));

    COST_INC_LOAD(20);

//    double grey = rw * r + gw * g + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;

    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
     __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
     __m256d c_h = _mm256_hadd_pd(c,c_perm);
     __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
     __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
     double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
     double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
     double c_hsum = c_hsum_lo + c_hsum_hi;

     //AVX: double: horizontal add for 1 vector
      __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
      __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
      __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
      __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
      double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
      double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
      double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 =  _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
     __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
     __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
     __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
     __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
     double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
     double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
     double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;

    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;
//    double gmu = g-mu;
//    double bmu = b-mu;

    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;
//    double gz = g-0.5;
//    double bz = b-0.5;

    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;
//    double gzgz = gz*gz;
//    double bzbz = bz*bz;

    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);
//    double ge = exp(-12.5*gzgz);
//    double be = exp(-12.5*bzbz);

    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);

    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);

    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);

    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);

    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;

    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);

    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;

    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi

    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);

    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;

    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;

    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;

    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
コード例 #6
0
ファイル: arithmetic.cpp プロジェクト: ArtisticCoding/OpenCP
inline __m128 _mm_pow_ps(__m128 a, __m128 b)
{
    return exp_ps(_mm_mul_ps(b, log_ps(a)));
}