Esempio n. 1
0
/* AVX implementation for complex reciprocal */
inline __m256 srslte_mat_cf_recip_avx(__m256 a) {
  __m256 conj = _MM256_CONJ_PS(a);
  __m256 sqabs = _mm256_mul_ps(a, a);
  sqabs = _mm256_add_ps(_mm256_movehdup_ps(sqabs), _mm256_moveldup_ps(sqabs));

  __m256 recp = _mm256_rcp_ps(sqabs);

  return _mm256_mul_ps(recp, conj);
}
Esempio n. 2
0
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const
{
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the exact same calculation after the SIMD section
    // but doing it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    float result[n] __attribute__((aligned (16)));

    for (; atom_i < this->n-8; atom_i+=8)
    {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++)
    {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++)
    {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2)
        {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;;

    return pe;
}
Esempio n. 3
0
/*---------------------------------------------------------------------------*/
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const
{
    int u, v, w;
    w = ci;
    u = w == 0 ? 1 : 0;
    v = w == 2 ? 1 : 2;

    __m256 nu = _mm256_broadcast_ss(&this->nu);
    __m256 np = _mm256_broadcast_ss(&this->np);
    __m256 nv = _mm256_broadcast_ss(&this->nv);
    __m256 pu = _mm256_broadcast_ss(&this->pu);
    __m256 pv = _mm256_broadcast_ss(&this->pv);
    __m256 e0u = _mm256_broadcast_ss(&this->e0u);
    __m256 e0v = _mm256_broadcast_ss(&this->e0v);
    __m256 e1u = _mm256_broadcast_ss(&this->e1u);
    __m256 e1v = _mm256_broadcast_ss(&this->e1v);

    __m256 ou = orig[u];
    __m256 ov = orig[v];
    __m256 ow = orig[w];
    __m256 du = _mm256_broadcast_ss(&d[u]);
    __m256 dv = _mm256_broadcast_ss(&d[v]);
    __m256 dw = _mm256_broadcast_ss(&d[w]);

    __m256 dett = np -(ou*nu+ov*nv+ow);
    __m256 det = du*nu+dv*nv+dw;
    __m256 Du = du*dett - (pu-ou)*det;
    __m256 Dv = dv*dett - (pv-ov)*det;
    __m256 detu = (e1v*Du - e1u*Dv);
    __m256 detv = (e0u*Dv - e0v*Du);

    __m256 tmpdet0 = det - detu - detv;

    __m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();

    mask = mask & detMask;

    __m256 rdet = _mm256_rcp_ps(det);

    result->t = dett * rdet;
    result->u = detu * rdet;
    result->v = detv * rdet;

    return mask & (result->t > _mm256_setzero_ps());
    /**/
}
Esempio n. 4
0
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result)
{
	BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f);
	// code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html
	// approximates sigmoid function with coefficient 4.0f
	static const __m256 ones = _mm256_set1_ps(1.0f);
	static const __m256 oneFourths = _mm256_set1_ps(0.25f);
	static const __m256 fours = _mm256_set1_ps(4.0f);

	__m256 temp;
	const __m256* vOutput = (__m256*)_neuronOutput;

	// min (output, 4.0)
	temp = _mm256_min_ps(*vOutput, fours);
	// multiply by 0.25
	temp = _mm256_mul_ps(temp, oneFourths);
	// 1 - ans
	temp = _mm256_sub_ps(ones, temp);
	// ans^16
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	temp = _mm256_mul_ps(temp, temp);
	// 1 + ans
	temp = _mm256_add_ps(ones, temp);
	// 1 / ans
	temp = _mm256_rcp_ps(temp);

#ifndef NDEBUG
	const float* _temp = (float*)&temp;
	assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f);
	assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f);
	assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f);
	assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f);
	assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f);
	assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f);
	assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f);
	assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f);
#endif

	// return ans
	_mm256_store_ps(result, temp);
};
Esempio n. 5
0
CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX(
    const void *poOptions,
    GUInt32 nPoints,
    CPL_UNUSED const double *unused_padfX,
    CPL_UNUSED const double *unused_padfY,
    CPL_UNUSED const double *unused_padfZ,
    double dfXPoint, double dfYPoint,
    double *pdfValue,
    void* hExtraParamsIn )
{
    size_t i = 0;
    GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
    const float* pafX = psExtraParams->pafX;
    const float* pafY = psExtraParams->pafY;
    const float* pafZ = psExtraParams->pafZ;

    const float fEpsilon = 0.0000000000001f;
    const float fXPoint = (float)dfXPoint;
    const float fYPoint = (float)dfYPoint;
    const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
    const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
    const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
    __m256 ymm_nominator = _mm256_setzero_ps();
    __m256 ymm_denominator = _mm256_setzero_ps();
    int mask = 0;

#undef LOOP_SIZE
#if defined(__x86_64) || defined(_M_X64)
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
    /* whereas we have 16 for 64bit */
#define LOOP_SIZE   16
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x);            /* rx = pafX[i] - fXPoint */
        __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y);            /* ry = pafY[i] - fYPoint */
        __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),               /* r2 = rx * rx + ry * ry */
                                   _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
                                     _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                               /* invr2 = 1.0f / r2 */
        __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
        ymm_nominator = _mm256_add_ps(ymm_nominator,                            /* nominator += invr2 * pafZ[i] */
                            _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
        ymm_nominator = _mm256_add_ps(ymm_nominator,
                            _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) |           /* if( r2 < fEpsilon) */
              (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8);
        if( mask )
            break;
    }
#else
#define LOOP_SIZE   8
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x);           /* rx = pafX[i] - fXPoint */
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y);           /* ry = pafY[i] - fYPoint */
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),              /* r2 = rx * rx + ry * ry */
                                   _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                              /* invr2 = 1.0f / r2 */
        ymm_nominator = _mm256_add_ps(ymm_nominator,                           /* nominator += invr2 * pafZ[i] */
                            _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);           /* denominator += invr2 */
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS));            /* if( r2 < fEpsilon) */
        if( mask )
            break;
    }
#endif

    /* Find which i triggered r2 < fEpsilon */
    if( mask )
    {
        for(int j = 0; j < LOOP_SIZE; j++ )
        {
            if( mask & (1 << j) )
            {
                (*pdfValue) = (pafZ)[i + j];

                // GCC and MSVC need explicit zeroing
#if !defined(__clang__)
                _mm256_zeroupper();
#endif
                return CE_None;
            }
        }
    }
#undef LOOP_SIZE

    /* Get back nominator and denominator values for YMM registers */
    float afNominator[8], afDenominator[8];
    _mm256_storeu_ps(afNominator, ymm_nominator);
    _mm256_storeu_ps(afDenominator, ymm_denominator);

    // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits
    // Other compilers will continue using AVX for the below floating points operations
#if defined(_MSC_FULL_VER)
    _mm256_zeroupper();
#endif

    float fNominator = afNominator[0] + afNominator[1] +
                       afNominator[2] + afNominator[3] +
                       afNominator[4] + afNominator[5] +
                       afNominator[6] + afNominator[7];
    float fDenominator = afDenominator[0] + afDenominator[1] +
                         afDenominator[2] + afDenominator[3] +
                         afDenominator[4] + afDenominator[5] +
                         afDenominator[6] + afDenominator[7];

    /* Do the few remaining loop iterations */
    for ( ; i < nPoints; i++ )
    {
        const float fRX = pafX[i] - fXPoint;
        const float fRY = pafY[i] - fYPoint;
        const float fR2 =
            fRX * fRX + fRY * fRY;

        // If the test point is close to the grid node, use the point
        // value directly as a node value to avoid singularity.
        if ( fR2 < 0.0000000000001 )
        {
            break;
        }
        else
        {
            const float fInvR2 = 1.0f / fR2;
            fNominator += fInvR2 * pafZ[i];
            fDenominator += fInvR2;
        }
    }

    if( i != nPoints )
    {
        (*pdfValue) = pafZ[i];
    }
    else
    if ( fDenominator == 0.0 )
    {
        (*pdfValue) =
            ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
    }
    else
        (*pdfValue) = fNominator / fDenominator;

    // GCC needs explicit zeroing
#if defined(__GNUC__) && !defined(__clang__)
    _mm256_zeroupper();
#endif

    return CE_None;
}