/* AVX implementation for complex reciprocal.
 *
 * Treats each adjacent float pair of `a` as one complex number (re, im) and
 * returns an approximation of 1/z per lane pair, using 1/z = conj(z)/|z|^2.
 * NOTE: _mm256_rcp_ps is the fast approximate reciprocal (~12-bit mantissa),
 * so the result is approximate by design.
 */
inline __m256 srslte_mat_cf_recip_avx(__m256 a)
{
  /* Numerator: conj(z) (project-provided macro flips the imaginary signs). */
  const __m256 numerator = _MM256_CONJ_PS(a);

  /* |z|^2 per pair: square every lane, then add each pair's high/low
   * duplicates so re^2 + im^2 lands in both lanes of the pair. */
  const __m256 squares = _mm256_mul_ps(a, a);
  const __m256 norm2 = _mm256_add_ps(_mm256_movehdup_ps(squares),
                                     _mm256_moveldup_ps(squares));

  /* conj(z) * approx(1/|z|^2) */
  return _mm256_mul_ps(_mm256_rcp_ps(norm2), numerator);
}
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const { float pe = 0.0; int atom_i = 0; /* BEGIN SIMD SECTION */ // This performs the exact same calculation after the SIMD section // but doing it on 8 atoms at a time using SIMD instructions. coordinates8 rand_xyz8(rand_xyz), atom_xyz; __m256 r2_8, mask, r6, ri6, pe_tmp; __m256 pe_sum = _mm256_setzero_ps(); float result[n] __attribute__((aligned (16))); for (; atom_i < this->n-8; atom_i+=8) { atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i); r2_8 = distance2(atom_xyz, rand_xyz8, box); mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS); r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8)); ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6)); pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8))); pe_sum = _mm256_add_ps(pe_tmp, pe_sum); } _mm256_store_ps(result, pe_sum); for (int i = 0; i < 8; i++) { pe += result[i]; } /* END SIMD SECTION */ for (; atom_i < this->n; atom_i++) { coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i); float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box)); if (r2 < this->rcut2) { float ri6 = 1.0/(pow(r2,3)); pe += ri6*(this->c12*ri6 - this->c6); } } pe += this->n/vol * this->tail_factor;; return pe; }
/*---------------------------------------------------------------------------*/
// 8-wide ray/triangle intersection test against this precomputed triangle.
//
// Appears to implement a projection-plane triangle test (Shevtsov/Havel
// style): the triangle stores precomputed plane/edge coefficients (nu, nv,
// np, pu, pv, e0u/e0v, e1u/e1v) relative to a dominant axis `ci`, and the
// barycentric coordinates are obtained as ratios of determinants.
// TODO(review): confirm the precomputation scheme against the code that
// fills nu/nv/np — this reading is inferred from the member names.
//
// Parameters:
//   mask   — lanes still active; only these can produce a hit
//   orig   — 8 ray origins (indexable by axis)
//   d      — one shared ray direction for all 8 lanes
//   result — out: t (hit distance), u, v (barycentrics), per lane
// Returns the input mask ANDed with the lanes that hit (inside the triangle
// and t > 0).
//
// NOTE: arithmetic/comparison operators on __m256 rely on compiler vector
// extensions or project-defined operator overloads, and _mm256_rcp_ps is the
// approximate reciprocal, so t/u/v are approximate.
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const
{
    // Map the dominant axis ci -> w and the two projection axes -> u, v.
    int u, v, w;
    w = ci;
    u = w == 0 ? 1 : 0;
    v = w == 2 ? 1 : 2;

    // Broadcast the triangle's precomputed scalar coefficients to all lanes.
    __m256 nu = _mm256_broadcast_ss(&this->nu);
    __m256 np = _mm256_broadcast_ss(&this->np);
    __m256 nv = _mm256_broadcast_ss(&this->nv);
    __m256 pu = _mm256_broadcast_ss(&this->pu);
    __m256 pv = _mm256_broadcast_ss(&this->pv);
    __m256 e0u = _mm256_broadcast_ss(&this->e0u);
    __m256 e0v = _mm256_broadcast_ss(&this->e0v);
    __m256 e1u = _mm256_broadcast_ss(&this->e1u);
    __m256 e1v = _mm256_broadcast_ss(&this->e1v);

    // Per-lane ray origins projected onto the chosen axes.
    __m256 ou = orig[u];
    __m256 ov = orig[v];
    __m256 ow = orig[w];
    // The single ray direction, broadcast per axis.
    __m256 du = _mm256_broadcast_ss(&d[u]);
    __m256 dv = _mm256_broadcast_ss(&d[v]);
    __m256 dw = _mm256_broadcast_ss(&d[w]);

    // dett/det: numerator and denominator of the hit distance t.
    __m256 dett = np -(ou*nu+ov*nv+ow);
    __m256 det = du*nu+dv*nv+dw;
    // Du/Dv: intermediate determinants for the barycentric numerators.
    __m256 Du = du*dett - (pu-ou)*det;
    __m256 Dv = dv*dett - (pv-ov)*det;
    __m256 detu = (e1v*Du - e1u*Dv);
    __m256 detv = (e0u*Dv - e0v*Du);
    __m256 tmpdet0 = det - detu - detv;

    // Sign-bit trick: the hit is inside the triangle iff tmpdet0, detu and
    // detv all share det's sign; XORs collect sign disagreements, the XOR
    // with g_one8 (presumably all-ones/1.0f constant — verify its definition)
    // flips the test so `> 0` selects agreeing lanes.
    __m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();
    mask = mask & detMask;

    // Approximate reciprocal of the denominator, then normalize t, u, v.
    __m256 rdet = _mm256_rcp_ps(det);
    result->t = dett * rdet;
    result->u = detu * rdet;
    result->v = detv * rdet;

    // Accept only lanes with a hit in front of the ray origin.
    return mask & (result->t > _mm256_setzero_ps());
    /**/
}
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result) { BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f); // code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html // approximates sigmoid function with coefficient 4.0f static const __m256 ones = _mm256_set1_ps(1.0f); static const __m256 oneFourths = _mm256_set1_ps(0.25f); static const __m256 fours = _mm256_set1_ps(4.0f); __m256 temp; const __m256* vOutput = (__m256*)_neuronOutput; // min (output, 4.0) temp = _mm256_min_ps(*vOutput, fours); // multiply by 0.25 temp = _mm256_mul_ps(temp, oneFourths); // 1 - ans temp = _mm256_sub_ps(ones, temp); // ans^16 temp = _mm256_mul_ps(temp, temp); temp = _mm256_mul_ps(temp, temp); temp = _mm256_mul_ps(temp, temp); temp = _mm256_mul_ps(temp, temp); // 1 + ans temp = _mm256_add_ps(ones, temp); // 1 / ans temp = _mm256_rcp_ps(temp); #ifndef NDEBUG const float* _temp = (float*)&temp; assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f); assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f); assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f); assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f); assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f); assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f); assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f); assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f); #endif // return ans _mm256_store_ps(result, temp); };
/**
 * AVX implementation of inverse-distance-to-a-power-2 gridding, with no
 * smoothing and no search radius: weights every input point by 1/r^2.
 *
 * Point coordinates come from hExtraParamsIn (pre-packed float arrays pafX/
 * pafY/pafZ — presumably 32-byte aligned, since _mm256_load_ps is used;
 * verify against the code that allocates them). The unused_padf* double
 * arrays are intentionally ignored in favor of those float arrays.
 *
 * If any point lies within fEpsilon of the grid node, its Z value is
 * returned directly to avoid the 1/r^2 singularity. NOTE: the result uses
 * _mm256_rcp_ps (approximate reciprocal), so values differ slightly from the
 * scalar implementation.
 *
 * Returns CE_None; *pdfValue receives the interpolated value (or the
 * configured nodata value when the weight sum is zero).
 */
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( const void *poOptions, GUInt32 nPoints, CPL_UNUSED const double *unused_padfX, CPL_UNUSED const double *unused_padfY, CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, double *pdfValue, void* hExtraParamsIn )
{
    size_t i = 0;
    GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
    const float* pafX = psExtraParams->pafX;
    const float* pafY = psExtraParams->pafY;
    const float* pafZ = psExtraParams->pafZ;

    // Threshold below which a point is treated as coinciding with the node.
    const float fEpsilon = 0.0000000000001f;
    const float fXPoint = (float)dfXPoint;
    const float fYPoint = (float)dfYPoint;
    const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon);
    const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint);
    const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint);
    // Running vector accumulators for sum(z/r^2) and sum(1/r^2).
    __m256 ymm_nominator = _mm256_setzero_ps();
    __m256 ymm_denominator = _mm256_setzero_ps();
    // Bitmask of lanes where r^2 < epsilon; non-zero aborts the vector loop.
    int mask = 0;

#undef LOOP_SIZE
#if defined(__x86_64) || defined(_M_X64)
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
    /* whereas we have 16 for 64bit */
    // Process two YMM vectors (16 points) per iteration on 64-bit targets.
#define LOOP_SIZE 16
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x);            /* rx = pafX[i] - fXPoint */
        __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x);
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y);            /* ry = pafY[i] - fYPoint */
        __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y);
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),               /* r2 = rx * rx + ry * ry */
                                      _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8),
                                        _mm256_mul_ps(ymm_ry_8, ymm_ry_8));
        // Approximate reciprocal — fast, ~12-bit precision.
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                                  /* invr2 = 1.0f / r2 */
        __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8);
        ymm_nominator = _mm256_add_ps(ymm_nominator,                               /* nominator += invr2 * pafZ[i] */
                                      _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i)));
        ymm_nominator = _mm256_add_ps(ymm_nominator,
                                      _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);               /* denominator += invr2 */
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8);
        // Pack the two 8-lane "too close" comparisons into one 16-bit mask.
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) |  /* if( r2 < fEpsilon) */
               (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8);
        if( mask )
            break;
    }
#else
    // 32-bit targets: one YMM vector (8 points) per iteration.
#define LOOP_SIZE 8
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x);    /* rx = pafX[i] - fXPoint */
        __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y);    /* ry = pafY[i] - fYPoint */
        __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx),               /* r2 = rx * rx + ry * ry */
                                      _mm256_mul_ps(ymm_ry, ymm_ry));
        __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2);                                  /* invr2 = 1.0f / r2 */
        ymm_nominator = _mm256_add_ps(ymm_nominator,                               /* nominator += invr2 * pafZ[i] */
                                      _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i)));
        ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2);               /* denominator += invr2 */
        mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS));   /* if( r2 < fEpsilon) */
        if( mask )
            break;
    }
#endif

    /* Find which i triggered r2 < fEpsilon */
    if( mask )
    {
        for(int j = 0; j < LOOP_SIZE; j++ )
        {
            if( mask & (1 << j) )
            {
                // Coincident point: return its Z value directly.
                (*pdfValue) = (pafZ)[i + j];

                // GCC and MSVC need explicit zeroing
                // (avoid AVX->SSE transition penalties after returning)
#if !defined(__clang__)
                _mm256_zeroupper();
#endif
                return CE_None;
            }
        }
    }
#undef LOOP_SIZE

    /* Get back nominator and denominator values for YMM registers */
    float afNominator[8], afDenominator[8];
    _mm256_storeu_ps(afNominator, ymm_nominator);
    _mm256_storeu_ps(afDenominator, ymm_denominator);

    // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits
    // Other compilers will continue using AVX for the below floating points operations
#if defined(_MSC_FULL_VER)
    _mm256_zeroupper();
#endif

    // Horizontal reduction of the 8 accumulator lanes.
    float fNominator = afNominator[0] + afNominator[1] + afNominator[2] +
                       afNominator[3] + afNominator[4] + afNominator[5] +
                       afNominator[6] + afNominator[7];
    float fDenominator = afDenominator[0] + afDenominator[1] + afDenominator[2] +
                         afDenominator[3] + afDenominator[4] + afDenominator[5] +
                         afDenominator[6] + afDenominator[7];

    /* Do the few remaining loop iterations */
    for ( ; i < nPoints; i++ )
    {
        const float fRX = pafX[i] - fXPoint;
        const float fRY = pafY[i] - fYPoint;
        const float fR2 = fRX * fRX + fRY * fRY;

        // If the test point is close to the grid node, use the point
        // value directly as a node value to avoid singularity.
        if ( fR2 < 0.0000000000001 )
        {
            break;
        }
        else
        {
            const float fInvR2 = 1.0f / fR2;
            fNominator += fInvR2 * pafZ[i];
            fDenominator += fInvR2;
        }
    }

    // i != nPoints means the scalar loop broke on a coincident point.
    if( i != nPoints )
    {
        (*pdfValue) = pafZ[i];
    }
    else if ( fDenominator == 0.0 )
    {
        // No usable weights: fall back to the configured nodata value.
        (*pdfValue) = ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
    }
    else
        (*pdfValue) = fNominator / fDenominator;

    // GCC needs explicit zeroing
#if defined(__GNUC__) && !defined(__clang__)
    _mm256_zeroupper();
#endif

    return CE_None;
}