Example #1
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);

  /* clampMaxValue, zero and floatScale are file-scope float constants
     defined elsewhere in qcms */
  const float32x4_t max   = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min   = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* guard against zero-length input */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
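Example #1 above is qcms's NEON path for an RGBA transform: each channel is linearized through a per-channel gamma table, pushed through a 3x3 color matrix stored as rows of four floats, clamped, scaled into a LUT index, and looked up in the output tables. For reference, a scalar sketch of one loop iteration (a rough equivalent, not qcms itself; the clamp and scale values are passed in rather than assumed):

#include <stdint.h>

/* Scalar sketch of one loop iteration of the NEON routine above. */
static void transform_pixel_scalar(const float *igtbl_r, const float *igtbl_g,
                                   const float *igtbl_b, const float mat[3][4],
                                   const uint8_t *otdata_r, const uint8_t *otdata_g,
                                   const uint8_t *otdata_b,
                                   float clampMax, float floatScale,
                                   const unsigned char src[4], unsigned char dest[4])
{
  /* linearize each input channel through its gamma table */
  const float r = igtbl_r[src[0]];
  const float g = igtbl_g[src[1]];
  const float b = igtbl_b[src[2]];
  const uint8_t *out[3] = { otdata_r, otdata_g, otdata_b };

  for (int c = 0; c < 3; ++c) {
    /* row-major matrix multiply: lane c of vec_r + vec_g + vec_b above */
    float v = r * mat[0][c] + g * mat[1][c] + b * mat[2][c];
    v = v < 0.0f ? 0.0f : (v > clampMax ? clampMax : v);  /* clamp to [0, clampMax] */
    dest[c] = out[c][(int)(v * floatScale)];              /* truncate, then LUT */
  }
  dest[3] = src[3];                                       /* alpha passes through */
}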
Example #2
    dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const
    {
#if defined(SSE)
      if ( useSSE )
      {
        GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

        // Seeding with NaN relies on _mm_min_ps/_mm_max_ps returning the second
        // operand when the first is NaN, so the first corner replaces the seeds.
        __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );
        __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );

        char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
        for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
        {
          ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
          dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
          dp::math::Vec4f const& extent = objectImpl->getExtent();

          dp::math::sse::Vec4f vectors[8];
          vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

          dp::math::sse::Vec4f x( extent[0] * modelView[0] );
          dp::math::sse::Vec4f y( extent[1] * modelView[1] );
          dp::math::sse::Vec4f z( extent[2] * modelView[2] );

          vectors[1] = vectors[0] + x;
          vectors[2] = vectors[0] + y;
          vectors[3] = vectors[1] + y;
          vectors[4] = vectors[0] + z;
          vectors[5] = vectors[1] + z;
          vectors[6] = vectors[2] + z;
          vectors[7] = vectors[3] + z;

          for ( unsigned int i = 0; i < 8; ++i )
          {
            minValue = _mm_min_ps( minValue, vectors[i].sse() );
            maxValue = _mm_max_ps( maxValue, vectors[i].sse() );
          }
        }

        dp::math::Vec3f minVec, maxVec;
        _MM_EXTRACT_FLOAT( minVec[0], minValue, 0);
        _MM_EXTRACT_FLOAT( minVec[1], minValue, 1);
        _MM_EXTRACT_FLOAT( minVec[2], minValue, 2);

        _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0);
        _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1);
        _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2);

        return dp::math::Box3f( minVec, maxVec );
      }
      else
#elif defined(NEON)
        if ( useNEON )
        {
          GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

          float32x4_t minValue = vdupq_n_f32( std::numeric_limits<float>::max() );
          float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() );

          char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
          for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
          {
            const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
            dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
            dp::math::Vec4f const& extent = objectImpl->getExtent();

            dp::math::neon::Vec4f vectors[8];
            vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

            dp::math::neon::Vec4f x( extent[0] * modelView[0] );
            dp::math::neon::Vec4f y( extent[1] * modelView[1] );
            dp::math::neon::Vec4f z( extent[2] * modelView[2] );

            vectors[1] = vectors[0] + x;
            vectors[2] = vectors[0] + y;
            vectors[3] = vectors[1] + y;
            vectors[4] = vectors[0] + z;
            vectors[5] = vectors[1] + z;
            vectors[6] = vectors[2] + z;
            vectors[7] = vectors[3] + z;

            for ( unsigned int i = 0; i < 8; ++i )
            {
              minValue = vminq_f32( minValue, vectors[i].neon() );
              maxValue = vmaxq_f32( maxValue, vectors[i].neon() );
            }

          }

          dp::math::Vec3f minVec, maxVec;

          vst1q_lane_f32( &minVec[0], minValue, 0);
          vst1q_lane_f32( &minVec[1], minValue, 1);
          vst1q_lane_f32( &minVec[2], minValue, 2);

          vst1q_lane_f32( &maxVec[0], maxValue, 0);
          vst1q_lane_f32( &maxVec[1], maxValue, 1);
          vst1q_lane_f32( &maxVec[2], maxValue, 2);

          return dp::math::Box3f( minVec, maxVec );
        }
        else

#endif
      // CPU fallback
      {
        GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

        dp::math::Box4f boundingBox;

        char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
        for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
        {
          const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
          dp::math::Mat44f const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()));
          dp::math::Vec4f const& extent = objectImpl->getExtent();

          dp::math::Vec4f vectors[8];
          vectors[0] = (objectImpl->getLowerLeft() * modelView);

          dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1], extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] );
          dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5], extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] );
          dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9], extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] );

          vectors[1] = vectors[0] + x;
          vectors[2] = vectors[0] + y;
          vectors[3] = vectors[1] + y;
          vectors[4] = vectors[0] + z;
          vectors[5] = vectors[1] + z;
          vectors[6] = vectors[2] + z;
          vectors[7] = vectors[3] + z;

          for ( unsigned int i = 0; i < 8; ++i )
          {
            boundingBox.update( vectors[i] );
          }
        }

        dp::math::Vec4f lower = boundingBox.getLower();
        dp::math::Vec4f upper = boundingBox.getUpper();

        return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2]), dp::math::Vec3f( upper[0], upper[1], upper[2]));
      }
    }
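All three paths in Example #2 transform only the box's lower-left corner with the full matrix, scale the first three matrix rows by the box extent, and build the remaining seven corners by vector additions: since the transform is linear, corner(i,j,k) = c0 + i*x + j*y + k*z. A minimal standalone sketch of that trick (plain structs; the dp::math vector types are assumed to behave like this):

struct V4 {
  float v[4];
  V4 operator+(const V4& o) const {
    V4 r;
    for (int i = 0; i < 4; ++i) r.v[i] = v[i] + o.v[i];
    return r;
  }
};

// c0 is the transformed lower-left corner; x, y, z are the matrix rows
// scaled by the box extent. Seven adds replace seven matrix multiplies.
static void boxCorners(const V4& c0, const V4& x, const V4& y, const V4& z, V4 out[8])
{
  out[0] = c0;          // (0,0,0)
  out[1] = out[0] + x;  // (1,0,0)
  out[2] = out[0] + y;  // (0,1,0)
  out[3] = out[1] + y;  // (1,1,0)
  out[4] = out[0] + z;  // (0,0,1)
  out[5] = out[1] + z;  // (1,0,1)
  out[6] = out[2] + z;  // (0,1,1)
  out[7] = out[3] + z;  // (1,1,1)
}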
Example #3
typedef float32x4_t v4sf;  // typedef assumed from the surrounding source
v4sf min_ps(v4sf a, v4sf b) {
  return vminq_f32(a, b);
}
Example #4
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                    vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
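thresh_32f above is the internal 32-bit float kernel behind OpenCV's public cv::threshold; a typical call that exercises the THRESH_BINARY branch might look like this (a sketch, assuming a single-channel CV_32F matrix):

#include <opencv2/imgproc.hpp>

void binarize(const cv::Mat& src32f, cv::Mat& dst32f)
{
    // Pixels greater than 0.5f become 1.0f, everything else 0.0f,
    // matching the THRESH_BINARY scalar tail loop above.
    cv::threshold(src32f, dst32f, /*thresh=*/0.5, /*maxval=*/1.0, cv::THRESH_BINARY);
}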
Example #5
// ParallelType is assumed to be a typedef for float32x4_t in the enclosing class.
static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept  { return vminq_f32 (a, b); }
Example #6
inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }
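Wrappers like Example #6's vminq overload one name per vector type so that templated kernels can stay type-generic; a sketch of the pattern (vminq_u8 is a real NEON intrinsic, the template is illustrative):

#include <arm_neon.h>

inline uint8x16_t vminq(const uint8x16_t & v0, const uint8x16_t & v1) { return vminq_u8(v0, v1); }

// Any vector type with a vminq overload can be clamped the same way.
template <typename V>
V clampToCeiling(const V & v, const V & ceiling) { return vminq(v, ceiling); }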
Example #7
float32x4_t
test_vminq_f32 (float32x4_t __a, float32x4_t __b)
{
  return vminq_f32(__a, __b);
}
Example #8
// These constants are assumed from the surrounding source: the float exponent
// field is 8 bits wide and sits 23 bits above the mantissa's least significant bit.
static const int kShiftExponentIntoTopMantissa = 8;
static const int kFloatExponentShift = 23;

static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to compensate
    //    for the fact that the exponent has been shifted into the top,
    //    fractional part), and finally getting rid of the implicit leading
    //    one from the mantissa by subtracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
                                       vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
                                          vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to (-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
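A quick way to sanity-check vpowq_f32 against scalar libm is to compare lane by lane, compiled into the same translation unit as the function above (a test sketch; per the comments, the relative error should stay well under 0.2%):

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
  const float a[4] = { 0.5f, 2.0f, 10.0f, 123.456f };  // bases (must be > 0)
  const float b[4] = { 3.0f, -1.5f, 0.25f, 2.0f };     // exponents
  float r[4];
  vst1q_f32(r, vpowq_f32(vld1q_f32(a), vld1q_f32(b)));
  for (int i = 0; i < 4; ++i) {
    const float ref = powf(a[i], b[i]);
    printf("pow(%g, %g): approx=%g ref=%g rel.err=%g\n",
           a[i], b[i], r[i], ref, (r[i] - ref) / ref);
  }
  return 0;
}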