void
test_vmaxQf32 (void)
{
  float32x4_t out_float32x4_t;
  float32x4_t arg0_float32x4_t;
  float32x4_t arg1_float32x4_t;

  out_float32x4_t = vmaxq_f32 (arg0_float32x4_t, arg1_float32x4_t);
}
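The GCC test above only checks that the intrinsic compiles. A minimal runnable demonstration of what vmaxq_f32 actually computes, a lane-wise maximum of two float32x4_t vectors, might look like this (the input values are arbitrary):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    const float a[4] = { 1.0f, -2.0f,  3.0f, -4.0f };
    const float b[4] = { 0.5f,  2.0f, -3.0f,  4.0f };
    float r[4];

    /* Per-lane maximum: r[i] = max(a[i], b[i]) */
    vst1q_f32(r, vmaxq_f32(vld1q_f32(a), vld1q_f32(b)));

    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); /* 1 2 3 4 */
    return 0;
}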
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    // Clamp to FLT_MIN so that vrsqrteq_f32(0) == +inf cannot turn the
    // final 0 * inf product into NaN; sqrt(0) still comes out as 0.
    float32x4_t x1 = vmaxq_f32(x.val, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    // Two Newton-Raphson refinements of the 1/sqrt(x) estimate.
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return v_float32x4(vmulq_f32(x.val, e));
}
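The two vmulq_f32(vrsqrtsq_f32(...)) lines are Newton-Raphson steps: vrsqrteq_f32 gives a coarse estimate of 1/sqrt(x), and vrsqrtsq_f32(a, e) evaluates the step factor (3 - a*e)/2. A minimal scalar sketch of the same scheme, with the well-known bit trick standing in for the hardware estimate table (the helper name is hypothetical):

#include <float.h>
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the NEON sequence: start from a coarse 1/sqrt(x) estimate,
 * then apply e' = e * (3 - x*e*e) / 2 twice; each step roughly doubles the
 * number of correct bits. sqrt(x) is recovered as x * (1/sqrt(x)). */
static float sqrt_via_rsqrt(float x)
{
    float x1 = x < FLT_MIN ? FLT_MIN : x;   /* clamp, as vmaxq_f32 does above */

    union { float f; uint32_t u; } v = { x1 };
    v.u = 0x5f3759df - (v.u >> 1);          /* coarse estimate, stand-in for vrsqrteq_f32 */
    float e = v.f;

    e = e * (3.0f - x1 * e * e) * 0.5f;     /* first vrsqrtsq/vmulq refinement */
    e = e * (3.0f - x1 * e * e) * 0.5f;     /* second refinement */
    return x * e;
}

int main(void)
{
    printf("%f\n", sqrt_via_rsqrt(2.0f));   /* ~1.414214 */
    return 0;
}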
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);

  const float32x4_t max = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* CYA */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
{
    float32x4_t bestDot = vdupq_n_f32(0);
    int32x4_t bestIndex = vdupq_n_s32(0);
    for(int i = 0; i < buffer.size; ++i)
    {
        float32x4_t dot = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i]));
        uint32x4_t mask = vcgtq_f32(dot, bestDot);
        bestDot = vmaxq_f32(dot, bestDot);
        bestIndex = vbslq_s32(mask, buffer.pos[i], bestIndex);

        dot = vnegq_f32(dot);
        mask = vcgtq_f32(dot, bestDot);
        bestDot = vmaxq_f32(dot, bestDot);
        bestIndex = vbslq_s32(mask, buffer.neg[i], bestIndex);
    }
    Store<align>(buffer.index + col, bestIndex);
    Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy))));
}
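The loop above keeps a running per-lane maximum dot product and its index without branching: vcgtq_f32 produces an all-ones mask in exactly the lanes where the new candidate wins, vmaxq_f32 updates the value, and vbslq_s32 uses the same mask to splice the candidate's index into the running result. A standalone sketch of that pattern (values and names are illustrative only):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    float32x4_t best = vdupq_n_f32(0.0f);
    int32x4_t bestIdx = vdupq_n_s32(-1);

    const float candidates[2][4] = { {1, 5, 2, 0}, {3, 4, 6, 0} };
    for (int i = 0; i < 2; ++i) {
        float32x4_t c = vld1q_f32(candidates[i]);
        uint32x4_t mask = vcgtq_f32(c, best);               /* lanes where c > best */
        best = vmaxq_f32(c, best);                          /* update running max  */
        bestIdx = vbslq_s32(mask, vdupq_n_s32(i), bestIdx); /* update its index    */
    }

    int32_t idx[4];
    vst1q_s32(idx, bestIdx);
    printf("%d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]); /* 1 0 1 -1 */
    return 0;
}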
dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const
{
#if defined(SSE)
  if ( useSSE )
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

    // Seeding with NaN works with the SSE min/max: _mm_min_ps/_mm_max_ps
    // return the second operand when the comparison involves a NaN, so the
    // first transformed corner simply replaces the seed.
    __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );
    __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      dp::math::sse::Vec4f vectors[8];
      vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

      dp::math::sse::Vec4f x( extent[0] * modelView[0] );
      dp::math::sse::Vec4f y( extent[1] * modelView[1] );
      dp::math::sse::Vec4f z( extent[2] * modelView[2] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        minValue = _mm_min_ps( minValue, vectors[i].sse() );
        maxValue = _mm_max_ps( maxValue, vectors[i].sse() );
      }
    }

    dp::math::Vec3f minVec, maxVec;
    _MM_EXTRACT_FLOAT( minVec[0], minValue, 0 );
    _MM_EXTRACT_FLOAT( minVec[1], minValue, 1 );
    _MM_EXTRACT_FLOAT( minVec[2], minValue, 2 );
    _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0 );
    _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1 );
    _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2 );
    return dp::math::Box3f( minVec, maxVec );
  }
  else
#elif defined(NEON)
  if ( useNEON )
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

    // NEON's vminq_f32/vmaxq_f32 propagate NaN, so seed with +/-FLT_MAX
    // instead of NaN.
    float32x4_t minValue = vdupq_n_f32( std::numeric_limits<float>::max() );
    float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() );

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      dp::math::neon::Vec4f vectors[8];
      vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

      dp::math::neon::Vec4f x( extent[0] * modelView[0] );
      dp::math::neon::Vec4f y( extent[1] * modelView[1] );
      dp::math::neon::Vec4f z( extent[2] * modelView[2] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        minValue = vminq_f32( minValue, vectors[i].neon() );
        maxValue = vmaxq_f32( maxValue, vectors[i].neon() );
      }
    }

    dp::math::Vec3f minVec, maxVec;
    vst1q_lane_f32( &minVec[0], minValue, 0 );
    vst1q_lane_f32( &minVec[1], minValue, 1 );
    vst1q_lane_f32( &minVec[2], minValue, 2 );
    vst1q_lane_f32( &maxVec[0], maxValue, 0 );
    vst1q_lane_f32( &maxVec[1], maxValue, 1 );
    vst1q_lane_f32( &maxVec[2], maxValue, 2 );
    return dp::math::Box3f( minVec, maxVec );
  }
  else
#endif
  // CPU fallback
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);
    dp::math::Box4f boundingBox;

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::Mat44f const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()));
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      dp::math::Vec4f vectors[8];
      vectors[0] = (objectImpl->getLowerLeft() * modelView);

      dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1], extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] );
      dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5], extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] );
      dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9], extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        boundingBox.update( vectors[i] );
      }
    }

    dp::math::Vec4f lower = boundingBox.getLower();
    dp::math::Vec4f upper = boundingBox.getUpper();
    return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2] ),
                            dp::math::Vec3f( upper[0], upper[1], upper[2] ) );
  }
}
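All three branches avoid eight full matrix transforms per box by exploiting linearity: with one transformed corner and the three transformed, extent-scaled matrix rows as edge vectors, the remaining seven corners are plain vector additions, since M(p + e) = Mp + Me for the linear part. A minimal scalar sketch with a hypothetical Vec4 type standing in for the dp::math types:

#include <stddef.h>

/* Hypothetical 4-component vector; the dp::math types play this role above. */
typedef struct { float v[4]; } Vec4;

static Vec4 add(Vec4 a, Vec4 b)
{
    Vec4 r;
    for (size_t i = 0; i < 4; ++i)
        r.v[i] = a.v[i] + b.v[i];
    return r;
}

/* Given the transformed lower-left corner p and the transformed edge
 * vectors x, y, z (extent[i] times row i of the matrix), the other seven
 * corners are sums -- no further matrix multiplies needed. */
static void box_corners(Vec4 p, Vec4 x, Vec4 y, Vec4 z, Vec4 out[8])
{
    out[0] = p;
    out[1] = add(out[0], x);
    out[2] = add(out[0], y);
    out[3] = add(out[1], y);
    out[4] = add(out[0], z);
    out[5] = add(out[1], z);
    out[6] = add(out[2], z);
    out[7] = add(out[3], z);
}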
v4sf max_ps(v4sf a, v4sf b) { return vmaxq_f32(a, b); }
float32x4_t test_vmaxq_f32(float32x4_t a1, float32x4_t a2) {
  // CHECK: test_vmaxq_f32
  return vmaxq_f32(a1, a2);
  // CHECK: llvm.aarch64.neon.fmax.v4f32
}
static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept { return vmaxq_f32 (a, b); }
inline float32x4_t vmaxq(const float32x4_t & v0, const float32x4_t & v1) { return vmaxq_f32(v0, v1); }
float32x4_t test_vmaxq_f32 (float32x4_t __a, float32x4_t __b) { return vmaxq_f32(__a, __b); }
// Updates the following smoothed Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined, upon which actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }

  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // Scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero far-end.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] = ptrGCoh[0] * aec->sx[i] +
                 ptrGCoh[1] * WEBRTC_SPL_MAX(
                     xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
                     WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] +
                     ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the near-end (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
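The block with the "A B C D" comments is a generic horizontal-sum idiom: two pairwise vpadd_f32 steps fold the four lanes of a float32x4_t into one scalar. Factored out as a helper (the name is hypothetical; on AArch64 the single intrinsic vaddvq_f32 does the same job):

#include <arm_neon.h>

/* Sum the four lanes of v: two pairwise adds collapse
 * {A, B, C, D} -> {A+B, C+D} -> {A+B+C+D, A+B+C+D}. */
static inline float horizontal_sum_f32(float32x4_t v)
{
    float32x2_t sum = vpadd_f32(vget_low_f32(v), vget_high_f32(v)); /* A+B, C+D */
    sum = vpadd_f32(sum, sum);                       /* A+B+C+D in both lanes */
    return vget_lane_f32(sum, 0);
}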
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n can be evaluated by playing with the float representation.
    //     log2(y) in a small range can be approximated; this code uses an
    //       order-five polynomial approximation. The coefficients have been
    //       estimated with the Remez algorithm and the resulting polynomial
    //       has a maximum relative error of 0.00086%.

    // Compute n.
    //   This is done by masking the exponent, shifting it into the top bit
    //   of the mantissa, putting eight into the biased exponent (to
    //   shift/compensate for the fact that the exponent has been shifted
    //   into the top/fractional part), and finally getting rid of the
    //   implicit leading one from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);

    const uint32x4_t two_n =
        vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));

    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa =
        vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask);
    const float32x4_t y = vreinterpretq_f32_u32(
        vorrq_u32(mantissa, vec_zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //   pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with the float representation.
    //     2^y in a small range can be approximated; this code uses an
    //       order-two polynomial approximation. The coefficients have been
    //       estimated with the Remez algorithm and the resulting polynomial
    //       has a maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to (-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);

    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));

    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }
  return a_exp_b;
}
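Since vpowq_f32 only approximates a^b = 2^(b * log2 a), a quick scalar cross-check against the same identity evaluated with libm is a reasonable sanity test. A minimal sketch, assuming only C99 math.h (the helper name and the test values are arbitrary):

#include <math.h>
#include <stdio.h>

/* Scalar model of the identity the NEON code implements:
 * a^b = exp2(b * log2(a)), valid for a > 0. */
static float pow_via_exp2(float a, float b)
{
    return exp2f(b * log2f(a));
}

int main(void)
{
    const float a = 2.5f, b = 1.7f;
    const float approx = pow_via_exp2(a, b);
    const float exact = powf(a, b);
    printf("approx=%f exact=%f rel_err=%e\n",
           approx, exact, fabsf(approx - exact) / exact);
    return 0;
}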