inline
    double operator[](int i) const
    {
        __m128d buf0;
        if (i < 4) {
            if (i < 2) {
                buf0 = _mm512_extractf64x2_pd(val, 0);
            } else {
                buf0 = _mm512_extractf64x2_pd(val, 1);
            }
        } else {
            if (i < 6) {
                buf0 = _mm512_extractf64x2_pd(val, 2);
            } else {
                buf0 = _mm512_extractf64x2_pd(val, 3);
            }
        }

        i &= 1;

        if (i == 0) {
            return _mm_cvtsd_f64(buf0);
        }

        buf0 = _mm_shuffle_pd(buf0, buf0, 1);
        return _mm_cvtsd_f64(buf0);
    }
Esempio n. 2
0
double test_mm_cvtsd_f64(__m128d A) {
  // DAG-LABEL: test_mm_cvtsd_f64
  // DAG: extractelement <2 x double> %{{.*}}, i32 0
  //
  // ASM-LABEL: test_mm_cvtsd_f64
  // ASM: movapd
  return _mm_cvtsd_f64(A);
}
Esempio n. 3
0
xsqrt( double x )
{
  __v2df f, g;
  double _d;
  x += x;
  g = __extension__ (__v2df){ x, 0 };
  f = _mm_sqrt_pd( g );
  _d = _mm_cvtsd_f64 (f);
  return (_d);
}
Esempio n. 4
0
 double hadd(const vector4d& rhs)
 {
     // rhs = (x0, x1, x2, x3)
     // tmp = (x2, x3, x0, x1)
     __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
     // tmp = (x2+x0, x3+x1, -, -)
     tmp = _mm256_add_pd(rhs, tmp);
     // tmp = (x2+x0+x3+x1, -, -, -)
     tmp = _mm256_hadd_pd(tmp, tmp);
     return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
 }
Esempio n. 5
0
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                                 // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);   // x2,y2,x3,y3
        end = _mm256_load_pd((const double *)&cords[index + 4]);    // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001); // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000); // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000),  // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)  // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^  x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
Esempio n. 6
0
 /*!
  * \brief Perform an horizontal sum of the given vector.
  * \param in The input vector type
  * \return the horizontal sum of the vector
  */
 ETL_STATIC_INLINE(double) hadd(avx_simd_double in) {
     const __m256d t1 = _mm256_hadd_pd(in.value, _mm256_permute2f128_pd(in.value, in.value, 1));
     const __m256d t2 = _mm256_hadd_pd(t1, t1);
     return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
 }
Esempio n. 7
0
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint)
{

#if CHECK_FUNCTIONAL
    Vector3d param(EPoint);
#endif

    AVX2TABLETYPE *mp;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    const __m256d ONE_PD = _mm256_set1_pd(1.0);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d ss;
    __m256d blend;

    __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0));
    //     blend = _mm256_blend_pd(iii, jjj, 0);

    INCSUMAVX_VECTOR(mp, ss, iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 2);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3));
    blend = _mm256_blend_pd(iii, jjj, 6);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 4);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2));
    blend = _mm256_blend_pd(iii, jjj, 12);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3));
    //     blend = _mm256_blend_pd(iii, jjj, 14);

    INCSUMAVX_VECTOR(mp, ss, jjj);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1));
    blend = _mm256_blend_pd(iii, jjj, 10);

    INCSUMAVX_VECTOR(mp, ss, blend);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0));
    blend = _mm256_blend_pd(iii, jjj, 8);

    INCSUMAVX_VECTOR(mp, ss, blend);


    __m256d xy = _mm256_hadd_pd(x,y);
    __m128d xy_up = _mm256_extractf128_pd(xy,1);
    xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up);
    _mm_storeu_pd(&result[X],xy_up);

    __m128d z_up = _mm256_extractf128_pd(z,1);
    z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up);
    z_up = _mm_hadd_pd(z_up,z_up);
    result[Z] = _mm_cvtsd_f64(z_up);


#if CHECK_FUNCTIONAL
    {
        Vector3d portable_res;
        PortableDNoise(portable_res , param);
        if (fabs(portable_res[X] - result[X]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise X error");
        }
        if (fabs(portable_res[Y] - result[Y]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Y error");
        }
        if (fabs(portable_res[Z] - result[Z]) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("DNoise Z error");
        }

    }

#endif



    _mm256_zeroupper();
    return;

}
Esempio n. 8
0
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator)
{
    AVX2TABLETYPE *mp;
    DBL sum = 0.0;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    const __m256d ONE_PD = _mm256_set1_pd(1);
    const __m128i short_si128 = _mm_set1_epi32(0xffff);

    const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0);
    const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON);
    const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy);
    const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn));

    const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0);

    const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn));
    const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD);

    const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn),
        _mm_set1_epi32(0xfff));

    const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn,
        _mm256_mul_pd(xyz_ixyzn,
            _mm256_sub_pd(_mm256_set1_pd(3.0),
                _mm256_add_pd(xyz_ixyzn, xyz_ixyzn))));

    const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn);

    const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20);
    const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0));
    const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1));

    const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy);

    const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));
    const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2)));

    int ints[4];
    _mm_storeu_si128((__m128i*)(ints), i_xyzn);

    const int ixiy_hash = Hash2d(ints[0], ints[1]);
    const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]);
    const int ixjy_hash = Hash2d(ints[0], ints[1] + 1);
    const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1);

    const int iz = ints[2];

    const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);
    const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1);

    __m256d sumr = _mm256_setzero_pd();
    __m256d sumr1 = _mm256_setzero_pd();


    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)];
    INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)];
    INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)];
    INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12);

    mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)];
    INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj);

    {
        sumr = _mm256_add_pd(sumr, sumr1);

        __m128d sumr_up = _mm256_extractf128_pd(sumr,1);
        sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up);
        sumr_up = _mm_hadd_pd(sumr_up,sumr_up);
        sum = _mm_cvtsd_f64(sumr_up);
    }

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        sum += 1.05242;
        sum *= 0.48985582;
        /*sum *= 0.5;
        sum += 0.5;*/

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }
    else
    {
        sum = sum + 0.5;                     /* range at this point -0.5 - 0.5... */

        if (sum < 0.0)
            sum = 0.0;
        if (sum > 1.0)
            sum = 1.0;
    }



#if CHECK_FUNCTIONAL
    {
        DBL orig_sum = PortableNoise(EPoint, noise_generator);
        if (fabs(orig_sum - sum) >= EPSILON)
        {
            throw POV_EXCEPTION_STRING("Noise error");
        }

    }

#endif

    _mm256_zeroupper();
    return (sum);
}
Esempio n. 9
0
/**
  * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
  */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110,
        const __m256d rgb0W,
        const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];
//    double g = im[center+1];
//    double b = im[center+2];

//    double r1 = im[top];
//    double g1 = im[top+1];
//    double b1 = im[top+2];
//    double r2 = im[left];
//    double g2 = im[left+1];
//    double b2 = im[left+2];
//    double r3 = im[right];
//    double g3 = im[right+1];
//    double b3 = im[right+2];
//    double r4 = im[bottom];
//    double g4 = im[bottom+1];
//    double b4 = im[bottom+2];

    __m256d c = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));

    COST_INC_LOAD(20);

//    double grey = rw * r + gw * g + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;

    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
     __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
     __m256d c_h = _mm256_hadd_pd(c,c_perm);
     __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
     __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
     double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
     double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
     double c_hsum = c_hsum_lo + c_hsum_hi;

     //AVX: double: horizontal add for 1 vector
      __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
      __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
      __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
      __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
      double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
      double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
      double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 =  _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
     __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
     __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
     __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
     __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
     double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
     double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
     double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;

    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;
//    double gmu = g-mu;
//    double bmu = b-mu;

    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;
//    double gz = g-0.5;
//    double bz = b-0.5;

    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;
//    double gzgz = gz*gz;
//    double bzbz = bz*bz;

    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);
//    double ge = exp(-12.5*gzgz);
//    double be = exp(-12.5*bzbz);

    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);

    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);

    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);

    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);

    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;

    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);

    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;

    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi

    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);

    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;

    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;

    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;

    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX

    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);

    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
Esempio n. 10
0
			extern "C" YEP_PRIVATE_SYMBOL double sqrt(double x) {
				const __m128d xmm = _mm_set_sd(x);
				return _mm_cvtsd_f64(_mm_sqrt_sd(xmm, xmm));
			}