void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    size_t i;
    unsigned char alpha;
    float32_t (*mat)[4] = transform->matrix;

    const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
    const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
    const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    const float32x4_t mat0  = vld1q_f32(mat[0]);
    const float32x4_t mat1  = vld1q_f32(mat[1]);
    const float32x4_t mat2  = vld1q_f32(mat[2]);

    const float32x4_t max   = vld1q_dup_f32(&clampMaxValue);
    const float32x4_t min   = vld1q_dup_f32(&zero);
    const float32x4_t scale = vld1q_dup_f32(&floatScale);

    float32x4_t vec_r, vec_g, vec_b;
    int32x4_t result;

    /* CYA */
    if (!length)
        return;

    for (i = 0; i < length; i++) {
        /* setup for transforming the pixel */
        vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
        vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
        vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
        alpha = *src++;

        /* gamma * matrix */
        vec_r = vmulq_f32(vec_r, mat0);
        vec_g = vmulq_f32(vec_g, mat1);
        vec_b = vmulq_f32(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r  = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
        vec_r  = vmaxq_f32(min, vec_r);
        vec_r  = vminq_f32(max, vec_r);
        result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

        /* use calc'd indices to output RGB values */
        *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
        *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
        *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
        *dest++ = alpha;
    }
}
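A hypothetical scalar reference for the per-pixel pipeline above (input gamma LUT, 3x3 matrix, clamp to [0, 1], scale to an index, output LUTs). Plain arrays stand in for the qcms_transform fields; this is a sketch, not part of qcms.

static void transform_rgba_scalar(const float igtbl_r[256], const float igtbl_g[256],
                                  const float igtbl_b[256], const float mat[3][4],
                                  const uint8_t *ot_r, const uint8_t *ot_g,
                                  const uint8_t *ot_b, float scale,
                                  const unsigned char *src, unsigned char *dest,
                                  size_t length)
{
    for (size_t i = 0; i < length; i++) {
        /* linearize through the input gamma tables */
        float r = igtbl_r[*src++];
        float g = igtbl_g[*src++];
        float b = igtbl_b[*src++];
        unsigned char alpha = *src++;

        float out[3];
        for (int k = 0; k < 3; k++) {
            /* column k of the color matrix, clamped like vmaxq/vminq above */
            float v = r * mat[0][k] + g * mat[1][k] + b * mat[2][k];
            out[k] = v < 0.f ? 0.f : (v > 1.f ? 1.f : v);
        }

        /* scale to output-LUT indices, as the NEON code does with floatScale */
        *dest++ = ot_r[(int)(out[0] * scale)];
        *dest++ = ot_g[(int)(out[1] * scale)];
        *dest++ = ot_b[(int)(out[2] * scale)];
        *dest++ = alpha;
    }
}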
dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const
{
#if defined(SSE)
  if ( useSSE )
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

    // Seed the accumulators with NaN: minps/maxps return their second operand when either
    // operand is NaN, so the first corner simply replaces the seed.
    __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );
    __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      // Transform the lower-left corner once, then build the other seven corners by adding
      // the transformed, extent-scaled box axes.
      dp::math::sse::Vec4f vectors[8];
      vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

      dp::math::sse::Vec4f x( extent[0] * modelView[0] );
      dp::math::sse::Vec4f y( extent[1] * modelView[1] );
      dp::math::sse::Vec4f z( extent[2] * modelView[2] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        minValue = _mm_min_ps( minValue, vectors[i].sse() );
        maxValue = _mm_max_ps( maxValue, vectors[i].sse() );
      }
    }

    dp::math::Vec3f minVec, maxVec;
    _MM_EXTRACT_FLOAT( minVec[0], minValue, 0 );
    _MM_EXTRACT_FLOAT( minVec[1], minValue, 1 );
    _MM_EXTRACT_FLOAT( minVec[2], minValue, 2 );
    _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0 );
    _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1 );
    _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2 );
    return dp::math::Box3f( minVec, maxVec );
  }
  else
#elif defined(NEON)
  if ( useNEON )
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

    // NEON vminq_f32/vmaxq_f32 propagate NaNs, so seed with +/-FLT_MAX instead of NaN.
    float32x4_t minValue = vdupq_n_f32( std::numeric_limits<float>::max() );
    float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() );

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      dp::math::neon::Vec4f vectors[8];
      vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

      dp::math::neon::Vec4f x( extent[0] * modelView[0] );
      dp::math::neon::Vec4f y( extent[1] * modelView[1] );
      dp::math::neon::Vec4f z( extent[2] * modelView[2] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        minValue = vminq_f32( minValue, vectors[i].neon() );
        maxValue = vmaxq_f32( maxValue, vectors[i].neon() );
      }
    }

    dp::math::Vec3f minVec, maxVec;
    vst1q_lane_f32( &minVec[0], minValue, 0 );
    vst1q_lane_f32( &minVec[1], minValue, 1 );
    vst1q_lane_f32( &minVec[2], minValue, 2 );
    vst1q_lane_f32( &maxVec[0], maxValue, 0 );
    vst1q_lane_f32( &maxVec[1], maxValue, 1 );
    vst1q_lane_f32( &maxVec[2], maxValue, 2 );
    return dp::math::Box3f( minVec, maxVec );
  }
  else
#endif
  // CPU fallback
  {
    GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);
    dp::math::Box4f boundingBox;

    char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
    for ( size_t index = 0; index < groupImpl->getObjectCount(); ++index )
    {
      const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
      dp::math::Mat44f const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()));
      dp::math::Vec4f const& extent = objectImpl->getExtent();

      dp::math::Vec4f vectors[8];
      vectors[0] = (objectImpl->getLowerLeft() * modelView);

      dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1],
                         extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] );
      dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5],
                         extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] );
      dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9],
                         extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] );

      vectors[1] = vectors[0] + x;
      vectors[2] = vectors[0] + y;
      vectors[3] = vectors[1] + y;
      vectors[4] = vectors[0] + z;
      vectors[5] = vectors[1] + z;
      vectors[6] = vectors[2] + z;
      vectors[7] = vectors[3] + z;

      for ( unsigned int i = 0; i < 8; ++i )
      {
        boundingBox.update( vectors[i] );
      }
    }

    dp::math::Vec4f lower = boundingBox.getLower();
    dp::math::Vec4f upper = boundingBox.getUpper();
    return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2] ),
                            dp::math::Vec3f( upper[0], upper[1], upper[2] ) );
  }
}
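The SSE, NEON, and scalar branches share the same idea: transform the box's lower-left corner once, scale the first three matrix rows by the box extents, and build the remaining seven corners by summing those vectors. A minimal standalone sketch of that idea, using plain std::array rows instead of the dp::math types (all names here are hypothetical):

#include <algorithm>
#include <array>

using Vec4 = std::array<float, 4>;

static Vec4 add(const Vec4& a, const Vec4& b)
{
    return { a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3] };
}

// rows[i] is row i of a row-major model-view matrix; lowerLeft is the box corner in
// homogeneous form (w == 1); extent holds the box sizes along x, y, z.
static void transformedBoxBounds(const Vec4 rows[4], const Vec4& lowerLeft,
                                 const float extent[3], Vec4& outMin, Vec4& outMax)
{
    // v0 = lowerLeft * M (row-vector convention, as in the code above)
    Vec4 v0 = { 0.f, 0.f, 0.f, 0.f };
    for (int c = 0; c < 4; ++c)
        for (int r = 0; r < 4; ++r)
            v0[c] += lowerLeft[r] * rows[r][c];

    // Transformed, extent-scaled box axes: extent[i] * row i
    Vec4 x, y, z;
    for (int c = 0; c < 4; ++c) {
        x[c] = extent[0] * rows[0][c];
        y[c] = extent[1] * rows[1][c];
        z[c] = extent[2] * rows[2][c];
    }

    // Eight corners as sums of v0, x, y, z
    Vec4 corners[8];
    corners[0] = v0;
    corners[1] = add(corners[0], x);
    corners[2] = add(corners[0], y);
    corners[3] = add(corners[1], y);
    corners[4] = add(corners[0], z);
    corners[5] = add(corners[1], z);
    corners[6] = add(corners[2], z);
    corners[7] = add(corners[3], z);

    // Component-wise min/max over the corners
    outMin = outMax = corners[0];
    for (int i = 1; i < 8; ++i)
        for (int c = 0; c < 4; ++c) {
            outMin[c] = std::min(outMin[c], corners[i][c]);
            outMax[c] = std::max(outMax[c], corners[i][c]);
        }
}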
v4sf min_ps(v4sf a, v4sf b) { return vminq_f32(a, b); }
static void thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
    case THRESH_BINARY:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_cmpgt_ps( v0, thresh4 );
                    v1 = _mm_cmpgt_ps( v1, thresh4 );
                    v0 = _mm_and_ps( v0, maxval4 );
                    v1 = _mm_and_ps( v1, maxval4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);
            uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] > thresh ? maxval : 0;
        }
        break;

    case THRESH_BINARY_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_cmple_ps( v0, thresh4 );
                    v1 = _mm_cmple_ps( v1, thresh4 );
                    v0 = _mm_and_ps( v0, maxval4 );
                    v1 = _mm_and_ps( v1, maxval4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);
            uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
                dst[j] = src[j] <= thresh ? maxval : 0;
        }
        break;
    case THRESH_TRUNC:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_min_ps( v0, thresh4 );
                    v1 = _mm_min_ps( v1, thresh4 );
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
                vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

            for( ; j < roi.width; j++ )
                dst[j] = std::min(src[j], thresh);
        }
        break;

    case THRESH_TOZERO:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                    v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                             vreinterpretq_u32_f32(v_src));
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
            {
                float v = src[j];
                dst[j] = v > thresh ? v : 0;
            }
        }
        break;

    case THRESH_TOZERO_INV:
        for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
        {
            j = 0;
#if CV_SSE2
            if( useSIMD )
            {
                __m128 thresh4 = _mm_set1_ps(thresh);
                for( ; j <= roi.width - 8; j += 8 )
                {
                    __m128 v0, v1;
                    v0 = _mm_loadu_ps( src + j );
                    v1 = _mm_loadu_ps( src + j + 4 );
                    v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                    v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                    _mm_storeu_ps( dst + j, v0 );
                    _mm_storeu_ps( dst + j + 4, v1 );
                }
            }
#elif CV_NEON
            float32x4_t v_thresh = vdupq_n_f32(thresh);

            for( ; j <= roi.width - 4; j += 4 )
            {
                float32x4_t v_src = vld1q_f32(src + j);
                uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                             vreinterpretq_u32_f32(v_src));
                vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
            }
#endif

            for( ; j < roi.width; j++ )
            {
                float v = src[j];
                dst[j] = v <= thresh ? v : 0;
            }
        }
        break;

    default:
        return CV_Error( CV_StsBadArg, "" );
    }
}
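thresh_32f is an internal kernel; the public OpenCV entry point that dispatches to it is cv::threshold. A minimal usage sketch on a single-channel float image (the data values are purely illustrative):

#include <opencv2/imgproc.hpp>

int main()
{
    // 1x8 single-channel float row; values chosen only for illustration.
    float data[8] = { -1.f, 0.25f, 0.5f, 0.75f, 1.f, 1.5f, 2.f, 3.f };
    cv::Mat src(1, 8, CV_32F, data), dst;

    // THRESH_TRUNC on CV_32F: dst[j] = min(src[j], thresh), matching the
    // vminq_f32 / _mm_min_ps loops in thresh_32f above.
    cv::threshold(src, dst, /*thresh=*/1.0, /*maxval=*/0.0, cv::THRESH_TRUNC);
    return 0;
}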
static forcedinline ParallelType min (ParallelType a, ParallelType b) noexcept { return vminq_f32 (a, b); }
inline float32x4_t vminq(const float32x4_t & v0, const float32x4_t & v1) { return vminq_f32(v0, v1); }
float32x4_t test_vminq_f32 (float32x4_t __a, float32x4_t __b) { return vminq_f32(__a, __b); }
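For reference, a minimal standalone example of what the wrappers above forward to: vminq_f32 takes the lane-wise minimum of two float32x4_t vectors (the array values are arbitrary):

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const float a[4] = { 1.0f, -2.0f, 3.5f,  0.0f };
  const float b[4] = { 0.5f,  4.0f, 3.0f, -1.0f };
  float out[4];

  /* Lane-wise minimum: out[i] = min(a[i], b[i]) */
  vst1q_f32(out, vminq_f32(vld1q_f32(a), vld1q_f32(b)));

  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 0.5 -2 3 -1 */
  return 0;
}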
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. float32x4_t log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated, this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bit of // the mantissa, putting eight into the biased exponent (to shift/ // compensate the fact that the exponent has been shifted in the top/ // fractional part and finally getting rid of the implicit leading one // from the mantissa by substracting it out. const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000); const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000); const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000); const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a), vec_float_exponent_mask); const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa); const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent); const float32x4_t n = vsubq_f32(vreinterpretq_f32_u32(n_0), vreinterpretq_f32_u32(vec_implicit_leading_one)); // Compute y. const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF); const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000); const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a), vec_mantissa_mask); const float32x4_t y = vreinterpretq_f32_u32(vorrq_u32(mantissa, vec_zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y). // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f); const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f); const float32x4_t C3 = vdupq_n_f32(-1.2315303f); const float32x4_t C2 = vdupq_n_f32(2.5988452f); const float32x4_t C1 = vdupq_n_f32(-3.3241990f); const float32x4_t C0 = vdupq_n_f32(3.1157899f); float32x4_t pol5_y = C5; pol5_y = vmlaq_f32(C4, y, pol5_y); pol5_y = vmlaq_f32(C3, y, pol5_y); pol5_y = vmlaq_f32(C2, y, pol5_y); pol5_y = vmlaq_f32(C1, y, pol5_y); pol5_y = vmlaq_f32(C0, y, pol5_y); const float32x4_t y_minus_one = vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one)); const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y); // Combine parts. log2_a = vaddq_f32(n, log2_y); } // b * log2(a) b_log2_a = vmulq_f32(b, log2_a); // Calculate exp2(x), x = b * log2(a). { // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. 
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }
  return a_exp_b;
}
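The two shift constants referenced above are defined elsewhere in the original file; the bit manipulation only works with 8 (exponent LSB moved from bit 23 down to bit 15) and 23 (binary32 mantissa width). A hypothetical smoke test, assuming those definitions precede vpowq_f32 in the same translation unit, could compare the four lanes against powf:

#include <arm_neon.h>
#include <math.h>
#include <stdio.h>

/* Assumed values, implied by the IEEE 754 single-precision layout; in the real
 * file these would have to appear before vpowq_f32. */
enum {
  kShiftExponentIntoTopMantissa = 8,
  kFloatExponentShift = 23
};

int main(void) {
  const float bases[4]     = { 0.5f,  2.0f, 10.0f, 0.001f };
  const float exponents[4] = { 3.0f, -1.5f,  0.3f, 2.0f };

  float32x4_t a = vld1q_f32(bases);
  float32x4_t b = vld1q_f32(exponents);
  float out[4];
  vst1q_f32(out, vpowq_f32(a, b));

  for (int i = 0; i < 4; i++) {
    float ref = powf(bases[i], exponents[i]);
    /* The polynomial approximations are only accurate to a fraction of a percent. */
    printf("a^b = %g, powf = %g, rel err = %g\n",
           out[i], ref, fabsf(out[i] - ref) / ref);
  }
  return 0;
}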