/*
 * Calculates the bounding rectangle of a point set or of the non-zero pixels
 * of an 8-bit mask, or retrieves an already calculated rectangle.
 *
 * array  - a point sequence (CvSeq / CvContour), a CV_32SC2 or CV_32FC2 matrix
 *          of points, or a CV_8UC1 / CV_8SC1 matrix treated as a mask.
 * update - only meaningful for sequences whose header is at least a CvContour:
 *          zero returns the rectangle stored in the header without
 *          recalculation; non-zero recalculates it and writes it back to the
 *          header. For matrices and for sequences with a smaller header the
 *          rectangle is always calculated and never stored.
 *
 * Returns the bounding rectangle; empty input (no points, or an all-zero mask)
 * yields {0, 0, 0, 0}.
 */
CV_IMPL CvRect
cvBoundingRect( CvArr* array, int update )
{
    CvSeqReader reader;
    CvRect rect = { 0, 0, 0, 0 };
    CvContour contour_header;
    CvSeq* ptseq = 0;
    CvSeqBlock block;

    CvMat stub, *mat = 0;
    // xmax/ymax start at -1 so that "nothing found" produces width == height == 0
    // in the final rect computation below.
    int xmin = 0, ymin = 0, xmax = -1, ymax = -1, i, j, k;
    int calculate = update;

    if( CV_IS_SEQ( array ))
    {
        ptseq = (CvSeq*)array;
        if( !CV_IS_SEQ_POINT_SET( ptseq ))
            CV_Error( CV_StsBadArg, "Unsupported sequence type" );

        // The header has no room for a cached rect: always calculate, never store.
        if( ptseq->header_size < (int)sizeof(CvContour))
        {
            update = 0;
            calculate = 1;
        }
    }
    else
    {
        mat = cvGetMat( array, &stub );
        if( CV_MAT_TYPE(mat->type) == CV_32SC2 ||
            CV_MAT_TYPE(mat->type) == CV_32FC2 )
        {
            // Point matrix: wrap it into a temporary sequence and use the point-set path.
            ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block);
            mat = 0;
        }
        else if( CV_MAT_TYPE(mat->type) != CV_8UC1 &&
                 CV_MAT_TYPE(mat->type) != CV_8SC1 )
            CV_Error( CV_StsUnsupportedFormat,
                "The image/matrix format is not supported by the function" );
        update = 0;
        calculate = 1;
    }

    if( !calculate )
        return ((CvContour*)ptseq)->rect;

    if( mat )
    {
        // Mask path: scan every row for its left-most and right-most non-zero byte.
        CvSize size = cvGetMatSize(mat);
        xmin = size.width;
        ymin = -1;

        for( i = 0; i < size.height; i++ )
        {
            uchar* _ptr = mat->data.ptr + i*mat->step;
            uchar* ptr = (uchar*)cvAlignPtr(_ptr, 4);
            int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
            j = 0;
            offset = MIN(offset, size.width);

            // Unaligned prefix of the row, checked byte by byte.
            for( ; j < offset; j++ )
                if( _ptr[j] )
                {
                    have_nz = 1;
                    break;
                }
            if( j < offset )
            {
                if( j < xmin )
                    xmin = j;
                if( j > xmax )
                    xmax = j;
            }
            if( offset < size.width )
            {
                // Work in coordinates relative to the 4-byte aligned pointer.
                xmin -= offset;
                xmax -= offset;
                size.width -= offset;
                j = 0;

                // Look for a non-zero byte left of the current xmin, 4 bytes at a time.
                for( ; j <= xmin - 4; j += 4 )
                    if( *((int*)(ptr+j)) )
                        break;
                for( ; j < xmin; j++ )
                    if( ptr[j] )
                    {
                        xmin = j;
                        if( j > xmax )
                            xmax = j;
                        have_nz = 1;
                        break;
                    }

                // Look for a non-zero byte right of the current xmax, scanning backwards.
                k_min = MAX(j-1, xmax);
                k = size.width - 1;
                for( ; k > k_min && (k&3) != 3; k-- )
                    if( ptr[k] )
                        break;
                if( k > k_min && (k&3) == 3 )
                {
                    for( ; k > k_min+3; k -= 4 )
                        if( *((int*)(ptr+k-3)) )
                            break;
                }
                for( ; k > k_min; k-- )
                    if( ptr[k] )
                    {
                        xmax = k;
                        have_nz = 1;
                        break;
                    }

                // Neither bound moved: the row may still have non-zero bytes inside
                // [j, k], which matters for ymin/ymax.
                if( !have_nz )
                {
                    j &= ~3;
                    for( ; j <= k - 3; j += 4 )
                        if( *((int*)(ptr+j)) )
                            break;
                    for( ; j <= k; j++ )
                        if( ptr[j] )
                        {
                            have_nz = 1;
                            break;
                        }
                }
                xmin += offset;
                xmax += offset;
                size.width += offset;
            }
            if( have_nz )
            {
                if( ymin < 0 )
                    ymin = i;
                ymax = i;
            }
        }

        // No non-zero pixel at all: collapse to an empty rectangle at the origin.
        if( xmin >= size.width )
            xmin = ymin = 0;
    }
    else if( ptseq->total )
    {
        int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2;
        cvStartReadSeq( ptseq, &reader, 0 );
        CvPoint pt;
        CV_READ_SEQ_ELEM( pt, reader );
#if CV_SSE4_2
        if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
        {
            if( !is_float )
            {
                __m128i minval, maxval;
                minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y

                for( i = 1; i < ptseq->total; i++)
                {
                    __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr));
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
                    minval = _mm_min_epi32(ptXY, minval);
                    maxval = _mm_max_epi32(ptXY, maxval);
                }
                xmin = _mm_cvtsi128_si32(minval);
                ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
                xmax = _mm_cvtsi128_si32(maxval);
                ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
            }
            else
            {
                __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
                minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));

                for( i = 1; i < ptseq->total; i++ )
                {
                    ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr);
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);

                    minvalf = _mm_min_ps(minvalf, ptXY);
                    maxvalf = _mm_max_ps(maxvalf, ptXY);
                }

                float xyminf[2], xymaxf[2];
                _mm_storel_pi((__m64*)xyminf, minvalf);
                _mm_storel_pi((__m64*)xymaxf, maxvalf);
                xmin = cvFloor(xyminf[0]);
                ymin = cvFloor(xyminf[1]);
                xmax = cvFloor(xymaxf[0]);
                ymax = cvFloor(xymaxf[1]);
            }
        }
        else
#endif
        {
            if( !is_float )
            {
                xmin = xmax = pt.x;
                ymin = ymax = pt.y;

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }
            }
            else
            {
                Cv32suf v;
                // init values
                // The float bit patterns are toggled so that they can be compared as ints.
                xmin = xmax = CV_TOGGLE_FLT(pt.x);
                ymin = ymax = CV_TOGGLE_FLT(pt.y);

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );
                    pt.x = CV_TOGGLE_FLT(pt.x);
                    pt.y = CV_TOGGLE_FLT(pt.y);

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }

                v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
                // because right and bottom sides of the bounding rectangle are not inclusive
                // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
                v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
            }
        }
    }

    // Build the result for BOTH the mask path and the point-set path. This used
    // to live inside the point-set branch only, so the extents found for a mask
    // were discarded and an empty rectangle was returned. With no input at all
    // the initial values (0, 0, -1, -1) still give {0, 0, 0, 0}.
    rect.x = xmin;
    rect.y = ymin;
    rect.width = xmax - xmin + 1;
    rect.height = ymax - ymin + 1;

    if( update )
        ((CvContour*)ptseq)->rect = rect;

    return rect;
}
// Calculates bounding rectagnle of a point set or retrieves already calculated static Rect pointSetBoundingRect( const Mat& points ) { int npoints = points.checkVector(2); int depth = points.depth(); CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S)); int xmin = 0, ymin = 0, xmax = -1, ymax = -1, i; bool is_float = depth == CV_32F; if( npoints == 0 ) return Rect(); const Point* pts = (const Point*)points.data; Point pt = pts[0]; #if CV_SSE4_2 if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) { if( !is_float ) { __m128i minval, maxval; minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y for( i = 1; i < npoints; i++ ) { __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]); minval = _mm_min_epi32(ptXY, minval); maxval = _mm_max_epi32(ptXY, maxval); } xmin = _mm_cvtsi128_si32(minval); ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); xmax = _mm_cvtsi128_si32(maxval); ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); } else { __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); for( i = 1; i < npoints; i++ ) { ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); minvalf = _mm_min_ps(minvalf, ptXY); maxvalf = _mm_max_ps(maxvalf, ptXY); } float xyminf[2], xymaxf[2]; _mm_storel_pi((__m64*)xyminf, minvalf); _mm_storel_pi((__m64*)xymaxf, maxvalf); xmin = cvFloor(xyminf[0]); ymin = cvFloor(xyminf[1]); xmax = cvFloor(xymaxf[0]); ymax = cvFloor(xymaxf[1]); } } else #endif { if( !is_float ) { xmin = xmax = pt.x; ymin = ymax = pt.y; for( i = 1; i < npoints; i++ ) { pt = pts[i]; if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } } else { Cv32suf v; // init values xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = CV_TOGGLE_FLT(pt.y); for( i = 1; i < npoints; i++ ) { pt = pts[i]; pt.x = CV_TOGGLE_FLT(pt.x); pt.y = CV_TOGGLE_FLT(pt.y); if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax 
= pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); // because right and bottom sides of the bounding rectangle are not inclusive // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } } return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); }
void BrushToolEdit::drawInner(const QPoint &pt, float strength) { float fixedStrength = params.strength; strength *= fixedStrength; auto color = params.color; std::array<int, 3> colorParts = Terrain::expandColor(color); __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0); SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST); (void) roundingModeScope; switch (tool->type()) { case BrushType::Blur: drawBlur(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Smoothen: drawSmoothen(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Raise: case BrushType::Lower: if (tool->type() == BrushType::Lower) { fixedStrength = -fixedStrength; strength = -strength; } switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength *= 3.f; drawRaiseLower(pt, [=](float ¤t, float before, float tip) { (void) before; current -= tip * strength; }); break; case BrushPressureMode::Constant: if (tool->type() == BrushType::Lower) { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength)); }); } else { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength)); }); } break; case BrushPressureMode::Adjustable: drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(before - tip * strength); }); break; } break; case BrushType::Paint: switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength = 1.f - std::exp2(-strength); drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { (void) before; // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); auto factor = _mm_set1_ps(tip * strength); // blend auto 
diff = _mm_sub_ps(colorMM, currentMF); diff = _mm_mul_ps(diff, factor); currentMF = _mm_add_ps(currentMF, diff); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Constant: fixedStrength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128()); // use "before" image to which way of color change is possible, and // compute possible range of result color auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * fixedStrength); auto adddiff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, adddiff); auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps()); // compute output image auto out1 = _mm_max_ps(currentMF, beforeMF); auto out2 = _mm_min_ps(currentMF, beforeMF); currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2)); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case 
BrushPressureMode::Adjustable: strength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // blend auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * strength); diff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, diff); // convert to RGB32 beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128()); beforeMM = _mm_cvttps_epi32(beforeMF); beforeMM = _mm_packs_epi32(beforeMM, beforeMM); beforeMM = _mm_packus_epi16(beforeMM, beforeMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(beforeMM)); }); break; } break; } }
dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const { #if defined(SSE) if ( useSSE ) { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() ); __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() ); char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::sse::Vec4f vectors[8]; vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView; dp::math::sse::Vec4f x( extent[0] * modelView[0] ); dp::math::sse::Vec4f y( extent[1] * modelView[1] ); dp::math::sse::Vec4f z( extent[2] * modelView[2] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { minValue = _mm_min_ps( minValue, vectors[i].sse() ); maxValue = _mm_max_ps( maxValue, vectors[i].sse() ); } } dp::math::Vec3f minVec, maxVec; _MM_EXTRACT_FLOAT( minVec[0], minValue, 0); _MM_EXTRACT_FLOAT( minVec[1], minValue, 1); _MM_EXTRACT_FLOAT( minVec[2], minValue, 2); _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0); _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1); _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2); return dp::math::Box3f( minVec, maxVec ); } else #elif defined(NEON) if ( useNEON ) { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); float32x4_t minValue = vdupq_n_f32( 
std::numeric_limits<float>::max() ); float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() ); char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::neon::Vec4f vectors[8]; vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView; dp::math::neon::Vec4f x( extent[0] * modelView[0] ); dp::math::neon::Vec4f y( extent[1] * modelView[1] ); dp::math::neon::Vec4f z( extent[2] * modelView[2] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { minValue = vminq_f32( minValue, vectors[i].neon() ); maxValue = vmaxq_f32( maxValue, vectors[i].neon() ); } } dp::math::Vec3f minVec, maxVec; vst1q_lane_f32( &minVec[0], minValue, 0); vst1q_lane_f32( &minVec[1], minValue, 1); vst1q_lane_f32( &minVec[2], minValue, 2); vst1q_lane_f32( &maxVec[0], maxValue, 0); vst1q_lane_f32( &maxVec[1], maxValue, 1); vst1q_lane_f32( &maxVec[2], maxValue, 2); return dp::math::Box3f( minVec, maxVec ); } else #endif // CPU fallback { GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group); dp::math::Box4f boundingBox; char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices()); for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index ) { const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index )); dp::math::Mat44f 
const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride())); dp::math::Vec4f const& extent = objectImpl->getExtent(); dp::math::Vec4f vectors[8]; vectors[0] = (objectImpl->getLowerLeft() * modelView); dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1], extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] ); dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5], extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] ); dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9], extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] ); vectors[1] = vectors[0] + x; vectors[2] = vectors[0] + y; vectors[3] = vectors[1] + y; vectors[4] = vectors[0] + z; vectors[5] = vectors[1] + z; vectors[6] = vectors[2] + z; vectors[7] = vectors[3] + z; for ( unsigned int i = 0;i < 8; ++i ) { boundingBox.update( vectors[i] ); } } dp::math::Vec4f lower = boundingBox.getLower(); dp::math::Vec4f upper = boundingBox.getUpper(); return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2]), dp::math::Vec3f( upper[0], upper[1], upper[2])); } }
void x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max) { __m128 current_max, current_min, work; // Load max and min values into all four slots of the XMM registers current_min = _mm_set1_ps(*min); current_max = _mm_set1_ps(*max); // Work input until "buf" reaches 16 byte alignment while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { // Load the next float into the work buffer work = _mm_set1_ps(*buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf++; nframes--; } // use 64 byte prefetch for quadruple quads while (nframes >= 16) { __builtin_prefetch(buf+64,0,0); work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; nframes-=16; } // work through aligned buffers while (nframes >= 4) { work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; nframes-=4; } // work through the rest < 4 samples while ( nframes > 0) { // Load the next float into the work buffer work = _mm_set1_ps(*buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf++; nframes--; } // Find min & max value in current_max through shuffle tricks work = current_min; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); work = _mm_min_ps (work, current_min); current_min = work; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); work = _mm_min_ps (work, current_min); _mm_store_ss(min, work); work = current_max; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); work = 
_mm_max_ps (work, current_max); current_max = work; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); work = _mm_max_ps (work, current_max); _mm_store_ss(max, work); }
void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal, FourVectors &color, bool DoHalfLambert ) const { FourVectors delta; Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL)); switch (m_Type) { case MATERIAL_LIGHT_POINT: case MATERIAL_LIGHT_SPOT: delta.DuplicateVector(m_Position); delta-=pos; break; case MATERIAL_LIGHT_DIRECTIONAL: delta.DuplicateVector(m_Direction); delta*=-1.0; break; default: delta.x = Four_Zeros; delta.y = Four_Zeros; delta.z = Four_Zeros; break; } __m128 dist2 = delta*delta; __m128 falloff; if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 ) { falloff = MMReplicate(m_Attenuation0); } else falloff= Four_Epsilons; if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 ) { falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation1),_mm_sqrt_ps(dist2))); } if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 ) { falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation2),dist2)); } falloff=_mm_rcp_ps(falloff); // Cull out light beyond this radius // now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format if (m_Range != 0.f) { __m128 RangeSquared=MMReplicate(m_RangeSquared); // !!speed!! 
falloff=_mm_and_ps(falloff,_mm_cmplt_ps(dist2,RangeSquared)); } delta.VectorNormalizeFast(); __m128 strength=delta*normal; if (DoHalfLambert) { strength=_mm_add_ps(_mm_mul_ps(strength,Four_PointFives),Four_PointFives); } else strength=_mm_max_ps(Four_Zeros,delta*normal); switch(m_Type) { case MATERIAL_LIGHT_POINT: // half-lambert break; case MATERIAL_LIGHT_SPOT: { __m128 dot2=_mm_sub_ps(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff __m128 cone_falloff_scale=_mm_mul_ps(MMReplicate(OneOver_ThetaDot_Minus_PhiDot), _mm_sub_ps(dot2,MMReplicate(m_PhiDot))); cone_falloff_scale=_mm_min_ps(cone_falloff_scale,Four_Ones); if ((m_Falloff!=0.0) && (m_Falloff!=1.0)) { // !!speed!! could compute integer exponent needed by powsse and store in light cone_falloff_scale=PowSSE(cone_falloff_scale,m_Falloff); } strength=_mm_mul_ps(cone_falloff_scale,strength); // now, zero out lighting where dot2<phidot. This will mask out any invalid results // from pow function, etc __m128 OutsideMask=_mm_cmpgt_ps(dot2,MMReplicate(m_PhiDot)); // outside light cone? strength=_mm_and_ps(OutsideMask,strength); } break; case MATERIAL_LIGHT_DIRECTIONAL: break; default: break; } strength=_mm_mul_ps(strength,falloff); color.x=_mm_add_ps(color.x,_mm_mul_ps(strength,MMReplicate(m_Color.x))); color.y=_mm_add_ps(color.y,_mm_mul_ps(strength,MMReplicate(m_Color.y))); color.z=_mm_add_ps(color.z,_mm_mul_ps(strength,MMReplicate(m_Color.z))); }
/*
 * Converts `samples` pixels of 16-bit RGBA to float RGBA with the colour
 * channels multiplied by alpha. Channels are scaled to 0..1 first (the scalar
 * path divides by 65535; the vector path multiplies by u16_float).
 *
 * When both buffers are 16-byte aligned, pixels are converted two at a time
 * with SSE2; whatever is left goes through the scalar loop. Returns `samples`.
 */
static long
conv_rgba16_rgbAF (const uint16_t *src, float *dst, long samples)
{
  long pair = 0;
  long left;

  if (((uintptr_t)src % 16) == 0 && ((uintptr_t)dst % 16) == 0)
    {
      const long pair_count = samples / 2;
      const __m128i *in = (const __m128i*) src;
      __v4sf *out = (__v4sf*) dst;

      /* max() against this leaves r,g,b lanes alone and forces lane 3 to 1.0 */
      const __v4sf max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };

      for (; pair < pair_count; pair++)
        {
          /* Widen the eight shorts to two vectors of four ints (high bits zero) */
          const __m128i wide0 = _mm_unpacklo_epi16 (in[pair + 0], (__m128i)_mm_setzero_ps());
          const __m128i wide1 = _mm_unpackhi_epi16 (in[pair + 0], (__m128i)_mm_setzero_ps());

          /* To float */
          const __m128 flt0 = _mm_cvtepi32_ps (wide0);
          const __m128 flt1 = _mm_cvtepi32_ps (wide1);

          /* Scale by u16_float */
          __v4sf px0 = flt0 * u16_float;
          __v4sf px1 = flt1 * u16_float;

          /* Broadcast alpha to every lane */
          __v4sf alpha0 = (__v4sf)_mm_shuffle_epi32((__m128i)px0, _MM_SHUFFLE(3, 3, 3, 3));
          __v4sf alpha1 = (__v4sf)_mm_shuffle_epi32((__m128i)px1, _MM_SHUFFLE(3, 3, 3, 3));

          /* Put 1.0 in the alpha lane; max is enough because alpha came from a short */
          alpha0 = _mm_max_ps(alpha0, max_mask);
          alpha1 = _mm_max_ps(alpha1, max_mask);

          /* Premultiply and store both pixels */
          out[2 * pair + 0] = px0 * alpha0;
          out[2 * pair + 1] = px1 * alpha1;
        }
      _mm_empty();
    }

  /* Skip what the vector loop handled: two pixels of four components per step */
  dst += pair * 2 * 4;
  src += pair * 2 * 4;
  left = samples - (pair * 2);

  for (; left; left--)
    {
      const float alpha = src[3] / 65535.0f;
      const float scale = alpha / 65535.0f;

      dst[0] = src[0] * scale;
      dst[1] = src[1] * scale;
      dst[2] = src[2] * scale;
      dst[3] = alpha;

      src += 4;
      dst += 4;
    }

  return samples;
}
/*
 * Converts a region of a 16-bit image to 8-bit pixbuf pixels: applies the 3x3
 * colour matrix, scales by _normalize, clamps to 0..1, applies the output
 * gamma, scales by _8bit and packs to bytes with the alpha byte OR-ed in from
 * _alpha_mask. The region is given by t->start_x/end_x/start_y/end_y.
 * Four pixels are processed per step; leftover pixels one at a time.
 */
void
transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint lane, row, col, y;

	/* Every coefficient replicated four times: 12 floats per matrix row */
	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (row = 0; row < 3; row++)
		for (col = 0; col < 3; col++)
			for (lane = 0; lane < 4; lane++)
				mat_ps[12*row + 4*col + lane] = matrix->coeff[row][col];

	/* Round the start down to a multiple of 4 pixels */
	int start_x = ((t->start_x) / 4) * 4;

	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if ((complete_w & 3) && (t->end_x+4) < input->w)
		complete_w = ((complete_w+3) / 4 * 4);

	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for (y = t->start_y; y < t->end_y; y++)
	{
		gushort *src = GET_PIXEL(input, start_x, y);
		guchar *dst = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(dst)&0xf);

		for (gint quads = complete_w >> 2; quads != 0; --quads)
		{
			/* Load four pixels (two per register) and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i px01 = _mm_load_si128((__m128i*)src);
			__m128i px23 = _mm_load_si128((__m128i*)src+1);
			_mm_prefetch(src + 64, _MM_HINT_NTA);
			__m128i w0 = _mm_unpacklo_epi16(px01, zero);
			__m128i w1 = _mm_unpackhi_epi16(px01, zero);
			__m128i w2 = _mm_unpacklo_epi16(px23, zero);
			__m128i w3 = _mm_unpackhi_epi16(px23, zero);
			__m128 f0 = _mm_cvtepi32_ps(w0);
			__m128 f1 = _mm_cvtepi32_ps(w1);
			__m128 f2 = _mm_cvtepi32_ps(w2);
			__m128 f3 = _mm_cvtepi32_ps(w3);

			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(f0, f1);
			__m128 b1b0 = _mm_unpackhi_ps(f0, f1);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(f2, f3);
			__m128 b3b2 = _mm_unpackhi_ps(f2, f3);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply the colour matrix, one output channel per call */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to integers, narrow to 16 bit and interleave */
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);

			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			__m128i lo = _mm_unpacklo_epi32(rg_i, bb_i);
			__m128i hi = _mm_unpackhi_epi32(rg_i, bb_i);
			__m128i rgba = _mm_or_si128(alpha_mask, _mm_packus_epi16(lo, hi));

			if (aligned_write)
				_mm_store_si128((__m128i*)dst, rgba);
			else
				_mm_storeu_si128((__m128i*)dst, rgba);

			src += 16;
			dst += 16;
		}

		/* Process remaining pixels */
		for (gint rest = complete_w & 3; rest != 0; --rest)
		{
			__m128i zero = _mm_setzero_si128();
			/* 64-bit load: the four 16-bit values of a single pixel */
			__m128i px = _mm_loadl_epi64((__m128i*)src);
			__m128i wide = _mm_unpacklo_epi16(px, zero);
			__m128 pxf = _mm_cvtepi32_ps(wide);

			/* Splat r,g,b */
			__m128 r = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(0,0,0,0));
			__m128 g = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(1,1,1,1));
			__m128 b = _mm_shuffle_ps(pxf, pxf, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
			r = _mm_movelh_ps(r, b2);		// BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));

			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)dst = _mm_cvtsi128_si32(r_i);

			src += 4;
			dst += 4;
		}
	}
}
/** Returns the element-wise maximum of a and b (thin wrapper over _mm_max_ps). */
static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept
{
    const ParallelType larger = _mm_max_ps (a, b);
    return larger;
}
/**
 * Adds bufferSrc into bufferDest sample by sample, clamping each sum to
 * [-1.0, 1.0].
 *
 * @param bufferDest   Destination samples; receives the mixed result.
 * @param bufferSrc    Source samples. When bForceMono is set this buffer is
 *                     modified in place (see below).
 * @param totalFloats  Number of floats in each buffer.
 * @param bForceMono   When true, each pair of adjacent source samples
 *                     (2k, 2k+1) is replaced by the average of the two before
 *                     mixing.
 *
 * When both pointers are 16-byte aligned, whole groups of four floats are
 * processed with SSE; the remaining (or all, if unaligned) floats take the
 * scalar path.
 */
void MixAudio(float *bufferDest, float *bufferSrc, UINT totalFloats, bool bForceMono)
{
    UINT floatsLeft = totalFloats;
    float *destTemp = bufferDest;
    float *srcTemp  = bufferSrc;

    if((UPARAM(destTemp) & 0xF) == 0 && (UPARAM(srcTemp) & 0xF) == 0)
    {
        UINT alignedFloats = floatsLeft & 0xFFFFFFFC;

        if(bForceMono)
        {
            __m128 halfVal = _mm_set_ps1(0.5f);
            for(UINT i=0; i<alignedFloats; i += 4)
            {
                float *micInput = srcTemp+i;
                __m128 val = _mm_load_ps(micInput);
                // Swap the two samples of each pair, then average with the original.
                __m128 shufVal = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));

                _mm_store_ps(micInput, _mm_mul_ps(_mm_add_ps(val, shufVal), halfVal));
            }
        }

        __m128 maxVal = _mm_set_ps1(1.0f);
        __m128 minVal = _mm_set_ps1(-1.0f);

        for(UINT i=0; i<alignedFloats; i += 4)
        {
            float *pos = destTemp+i;

            __m128 mix;
            mix = _mm_add_ps(_mm_load_ps(pos), _mm_load_ps(srcTemp+i));
            mix = _mm_min_ps(mix, maxVal);
            mix = _mm_max_ps(mix, minVal);

            _mm_store_ps(pos, mix);
        }

        floatsLeft  &= 0x3;
        destTemp    += alignedFloats;
        srcTemp     += alignedFloats;
    }

    if(floatsLeft)
    {
        if(bForceMono)
        {
            // Only complete pairs are averaged. The bound used to be
            // "i < floatsLeft", which for an odd count read and wrote
            // srcTemp[floatsLeft], one element past the end of the range.
            // A trailing unpaired sample is now mixed as is.
            for(UINT i=0; i + 1 < floatsLeft; i += 2)
            {
                srcTemp[i] += srcTemp[i+1];
                srcTemp[i] *= 0.5f;
                srcTemp[i+1] = srcTemp[i];
            }
        }

        for(UINT i=0; i<floatsLeft; i++)
        {
            float val = destTemp[i]+srcTemp[i];

            if(val < -1.0f)     val = -1.0f;
            else if(val > 1.0f) val = 1.0f;

            destTemp[i] = val;
        }
    }
}
/*
 * 16-bit edge filter pass, 8 pixels per step.
 *
 * Five rows are kept in the scratch buffer `buff` and rotated as the image is
 * walked top to bottom. For each pixel a horizontal sum (weights ar_mulxf,
 * taps at x-2, x-1, x+1, x+2 of the middle row) and a vertical sum (weights
 * ar_mulyf, taps on the two rows above and below) are formed. Their absolute
 * values are combined as alpha * larger + beta * smaller, shifted right by
 * eh->rshift and limited to 0xFFFF. Finally outputs >= eh->max become 0xFFFF
 * and outputs <= eh->min become 0.
 *
 * stride and bstride arrive in bytes and are halved to count 16-bit elements.
 * plane_max is not used by this routine.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    // Five consecutive row slots in the scratch buffer
    uint16_t* row0 = (uint16_t *)buff + 8;
    uint16_t* row1 = row0 + bstride;
    uint16_t* row2 = row1 + bstride;
    uint16_t* row3 = row2 + bstride;
    uint16_t* row4 = row3 + bstride;
    uint16_t *first_slot = row0, *last_slot = row4;

    // The two rows above the first line are taken from rows 2 and 1 of the source
    line_copy16(row0, srcp + 2 * stride, width, 2);
    line_copy16(row1, srcp + stride, width, 2);
    line_copy16(row2, srcp, width, 2);
    srcp += stride;
    line_copy16(row3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        // Step forward while rows remain; for the last two lines step back instead
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(row4, srcp, width, 2);

        uint16_t* horiz_taps[] = {row2 - 2, row2 - 1, row2 + 1, row2 + 2};
        uint16_t* vert_taps[]  = {row0, row1, row3, row4};

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};

            for (int tap = 0; tap < 4; tap++) {
                // Horizontal tap: unaligned load, widen to 2 x 4 ints, accumulate
                __m128 weight = _mm_load_ps(ar_mulxf[tap]);
                __m128i lo = _mm_loadu_si128((__m128i *)(horiz_taps[tap] + x));
                __m128i hi = _mm_unpackhi_epi16(lo, zero);
                lo = _mm_unpacklo_epi16(lo, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(lo), weight));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(hi), weight));

                // Vertical tap: aligned load from the row slot
                weight = _mm_load_ps(ar_mulyf[tap]);
                lo = _mm_load_si128((__m128i *)(vert_taps[tap] + x));
                hi = _mm_unpackhi_epi16(lo, zero);
                lo = _mm_unpacklo_epi16(lo, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(lo), weight));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(hi), weight));
            }

            __m128i out[2];
            for (int half = 0; half < 2; half++) {
                sumx[half] = mm_abs_ps(sumx[half]);
                sumy[half] = mm_abs_ps(sumy[half]);
                __m128 larger  = _mm_max_ps(sumx[half], sumy[half]);
                __m128 smaller = _mm_min_ps(sumx[half], sumy[half]);
                larger = _mm_add_ps(_mm_mul_ps(alpha, larger), _mm_mul_ps(beta, smaller));
                out[half] = _mm_srli_epi32(_mm_cvtps_epi32(larger), eh->rshift);
                out[half] = mm_min_epi32(out[half], pmax);
            }
            out[0] = mm_cast_epi32(out[0], out[1]);

            // out >= max  ->  all bits set
            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);

            // out <= min  ->  zero
            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);

            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }

        dstp += stride;

        // Rotate the window; the freed slot wraps around inside the scratch buffer
        row0 = row1;
        row1 = row2;
        row2 = row3;
        row3 = row4;
        row4 = (row4 == last_slot) ? first_slot : row4 + bstride;
    }
}
static gboolean draw(GtkWidget *widget, cairo_t *cr, dt_iop_module_t *self) { if(darktable.gui->reset) return FALSE; if(self->picked_color_max[0] < 0.0f) return FALSE; if(self->request_color_pick == DT_REQUEST_COLORPICK_OFF) return FALSE; dt_iop_invert_gui_data_t *g = (dt_iop_invert_gui_data_t *)self->gui_data; dt_iop_invert_params_t *p = (dt_iop_invert_params_t *)self->params; if(fabsf(p->color[0] - self->picked_color[0]) < 0.0001f && fabsf(p->color[1] - self->picked_color[1]) < 0.0001f && fabsf(p->color[2] - self->picked_color[2]) < 0.0001f) { // interrupt infinite loops return FALSE; } p->color[0] = self->picked_color[0]; p->color[1] = self->picked_color[1]; p->color[2] = self->picked_color[2]; GdkRGBA color = (GdkRGBA){.red = p->color[0], .green = p->color[1], .blue = p->color[2], .alpha = 1.0 }; gtk_color_chooser_set_rgba(GTK_COLOR_CHOOSER(g->colorpicker), &color); dt_dev_add_history_item(darktable.develop, self, TRUE); return FALSE; } static void colorpicker_callback(GtkColorButton *widget, dt_iop_module_t *self) { if(self->dt->gui->reset) return; dt_iop_invert_gui_data_t *g = (dt_iop_invert_gui_data_t *)self->gui_data; dt_iop_invert_params_t *p = (dt_iop_invert_params_t *)self->params; // turn off the other color picker so that this tool actually works ... 
gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(g->picker), FALSE); GdkRGBA c; gtk_color_chooser_get_rgba(GTK_COLOR_CHOOSER(widget), &c); p->color[0] = c.red; p->color[1] = c.green; p->color[2] = c.blue; dt_dev_add_history_item(darktable.develop, self, TRUE); } void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { dt_iop_invert_data_t *d = (dt_iop_invert_data_t *)piece->data; const float *const m = piece->pipe->processed_maximum; float film_rgb[4] = { d->color[0], d->color[1], d->color[2], 0.0f }; // Convert the RGB color to CYGM only if we're not in the preview pipe (which is already RGB) if((self->dev->image_storage.flags & DT_IMAGE_4BAYER) && !dt_dev_pixelpipe_uses_downsampled_input(piece->pipe)) dt_colorspaces_rgb_to_cygm(film_rgb, 1, d->RGB_to_CAM); const float film_rgb_f[4] = { film_rgb[0] * m[0], film_rgb[1] * m[1], film_rgb[2] * m[2], film_rgb[3] * m[3] }; // FIXME: it could be wise to make this a NOP when picking colors. not sure about that though. 
// if(self->request_color_pick){ // do nothing // } const int filters = dt_image_filter(&piece->pipe->image); const uint8_t (*const xtrans)[6] = (const uint8_t (*const)[6]) self->dev->image_storage.xtrans; if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && (filters == 9u)) { // xtrans float mosaiced #ifdef _OPENMP #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const float *in = ((float *)ivoid) + (size_t)j * roi_out->width; float *out = ((float *)ovoid) + (size_t)j * roi_out->width; for(int i = 0; i < roi_out->width; i++, out++, in++) *out = CLAMP(film_rgb_f[FCxtrans(j, i, roi_out, xtrans)] - *in, 0.0f, 1.0f); } for(int k = 0; k < 4; k++) piece->pipe->processed_maximum[k] = 1.0f; } else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters) { // bayer float mosaiced const __m128 val_min = _mm_setzero_ps(); const __m128 val_max = _mm_set1_ps(1.0f); #ifdef _OPENMP #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static) #endif for(int j = 0; j < roi_out->height; j++) { const float *in = ((float *)ivoid) + (size_t)j * roi_out->width; float *out = ((float *)ovoid) + (size_t)j * roi_out->width; int i = 0; int alignment = ((4 - (j * roi_out->width & (4 - 1))) & (4 - 1)); // process unaligned pixels for(; i < alignment; i++, out++, in++) *out = CLAMP(film_rgb_f[FC(j + roi_out->y, i + roi_out->x, filters)] - *in, 0.0f, 1.0f); const __m128 film = _mm_set_ps(film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 3, filters)], film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 2, filters)], film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 1, filters)], film_rgb_f[FC(j + roi_out->y, roi_out->x + i, filters)]); // process aligned pixels with SSE for(; i < roi_out->width - (4 - 1); i += 4, in += 4, out += 4) { const __m128 input = _mm_load_ps(in); const __m128 subtracted = _mm_sub_ps(film, input); _mm_stream_ps(out, _mm_max_ps(_mm_min_ps(subtracted, 
val_max), val_min)); } // process the rest for(; i < roi_out->width; i++, out++, in++) *out = CLAMP(film_rgb_f[FC(j + roi_out->y, i + roi_out->x, filters)] - *in, 0.0f, 1.0f); } _mm_sfence(); for(int k = 0; k < 4; k++) piece->pipe->processed_maximum[k] = 1.0f; } else { // non-mosaiced const int ch = piece->colors; const __m128 film = _mm_set_ps(1.0f, film_rgb[2], film_rgb[1], film_rgb[0]); #ifdef _OPENMP #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static) #endif for(int k = 0; k < roi_out->height; k++) { const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width; float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width; for(int j = 0; j < roi_out->width; j++, in += ch, out += ch) { const __m128 input = _mm_load_ps(in); const __m128 subtracted = _mm_sub_ps(film, input); _mm_stream_ps(out, subtracted); } } _mm_sfence(); if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); } }
/* Returns the lane-wise maximum of the two packed single-precision vectors.
   NOTE(review): the definition carried no return type in this file; __m128 is
   the type produced by _mm_max_ps.  Any storage-class or attribute qualifiers
   that may have preceded it are not visible here -- confirm.  */
__m128
test (__m128 s1, __m128 s2)
{
  return _mm_max_ps (s1, s2);
}
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax) { assert(vertices.size() % 16 == 0); // Simple k-means clustering by normal direction to improve backface culling efficiency std::vector<__m128> quadNormals; for (auto i = 0; i < vertices.size(); i += 4) { auto v0 = vertices[i + 0]; auto v1 = vertices[i + 1]; auto v2 = vertices[i + 2]; auto v3 = vertices[i + 3]; quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3)))); } std::vector<__m128> centroids; std::vector<uint32_t> centroidAssignment; centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f)); centroidAssignment.resize(vertices.size() / 4); bool anyChanged = true; for (int iter = 0; iter < 10 && anyChanged; ++iter) { anyChanged = false; for (auto j = 0; j < quadNormals.size(); ++j) { __m128 normal = quadNormals[j]; __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity()); int bestCentroid = -1; for (int k = 0; k < centroids.size(); ++k) { __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F); if (_mm_comige_ss(distance, bestDistance)) { bestDistance = distance; bestCentroid = k; } } if (centroidAssignment[j] != bestCentroid) { centroidAssignment[j] = bestCentroid; anyChanged = true; } } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = _mm_setzero_ps(); } for (int j = 0; j < quadNormals.size(); ++j) { int k = centroidAssignment[j]; centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]); } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = normalize(centroids[k]); } } std::vector<__m128> orderedVertices; for (int k = 0; k < centroids.size(); ++k) { for (int j = 0; j < vertices.size() / 4; ++j) { 
if (centroidAssignment[j] == k) { orderedVertices.push_back(vertices[4 * j + 0]); orderedVertices.push_back(vertices[4 * j + 1]); orderedVertices.push_back(vertices[4 * j + 2]); orderedVertices.push_back(vertices[4 * j + 3]); } } } auto occluder = std::make_unique<Occluder>(); __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin)); __m128 scalingX = _mm_set1_ps(2047.0f); __m128 scalingY = _mm_set1_ps(2047.0f); __m128 scalingZ = _mm_set1_ps(1023.0f); __m128 half = _mm_set1_ps(0.5f); for (size_t i = 0; i < orderedVertices.size(); i += 16) { for (auto j = 0; j < 4; ++j) { // Transform into [0,1] space relative to bounding box __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents); __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents); __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents); __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents); // Transpose into [xxxx][yyyy][zzzz][wwww] _MM_TRANSPOSE4_PS(v0, v1, v2, v3); // Scale and truncate to int v0 = _mm_fmadd_ps(v0, scalingX, half); v1 = _mm_fmadd_ps(v1, scalingY, half); v2 = _mm_fmadd_ps(v2, scalingZ, half); __m128i X = _mm_cvttps_epi32(v0); __m128i Y = _mm_cvttps_epi32(v1); __m128i Z = _mm_cvttps_epi32(v2); // Pack to 11/11/10 format __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z)); occluder->m_vertexData.push_back(XYZ); } } occluder->m_refMin = refMin; occluder->m_refMax = refMax; __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity()); __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity()); for (size_t i = 0; i < orderedVertices.size(); ++i) { min = _mm_min_ps(vertices[i], min); max = _mm_max_ps(vertices[i], max); } // Set W = 1 - this is expected by frustum culling code min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000); max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000); occluder->m_boundsMin = min; 
occluder->m_boundsMax = max; occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f)); return occluder; }
static __m128 mm_pow_ps(__m128 a, __m128 b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. __m128 log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated, this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bit of // the mantissa, putting eight into the biased exponent (to shift/ // compensate the fact that the exponent has been shifted in the top/ // fractional part and finally getting rid of the implicit leading one // from the mantissa by substracting it out. static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {0x43800000, 0x43800000, 0x43800000, 0x43800000}; static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; static const int shift_exponent_into_top_mantissa = 8; const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask)); const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n, shift_exponent_into_top_mantissa); const __m128 n_0 = _mm_or_ps( (__m128)n_1, *((__m128 *)eight_biased_exponent)); const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one)); // Compute y. 
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask)); const __m128 y = _mm_or_ps( mantissa, *((__m128 *)zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y). // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 static const ALIGN16_BEG float ALIGN16_END C5[4] = {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; static const ALIGN16_BEG float ALIGN16_END C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; static const ALIGN16_BEG float ALIGN16_END C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; static const ALIGN16_BEG float ALIGN16_END C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5)); const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4)); const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3)); const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2)); const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1)); const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0)); const __m128 y_minus_one = _mm_sub_ps( y, *((__m128 *)zero_biased_exponent_is_one)); const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y); // Combine parts. log2_a = _mm_add_ps(n, log2_y); } // b * log2(a) b_log2_a = _mm_mul_ps(b, log2_a); // Calculate exp2(x), x = b * log2(a). 
{ // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, 129.f, 129.f}; static const ALIGN16_BEG float min_input[4] ALIGN16_END = {-126.99999f, -126.99999f, -126.99999f, -126.99999f}; const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input)); const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input)); // Compute n. static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, 0.5f}; const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half)); const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); // Compute 2^n. static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {127, 127, 127, 127}; static const int float_exponent_shift = 23; const __m128i two_n_exponent = _mm_add_epi32( x_minus_half_floor, *((__m128i *)float_exponent_bias)); const __m128 two_n = (__m128)_mm_slli_epi32( two_n_exponent, float_exponent_shift); // Compute y. const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. 
static const ALIGN16_BEG float C2[4] ALIGN16_END = {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; static const ALIGN16_BEG float C1[4] ALIGN16_END = {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f}; const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2)); const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1)); const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0)); // Combine parts. a_exp_b = _mm_mul_ps(exp2_y, two_n); } return a_exp_b; }
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const { assume(rayDir.IsNormalized4()); assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!"); /* For reference, this is the C++ form of the vectorized SSE code below. float4 recipDir = rayDir.RecipFast4(); float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir); float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir); float4 near = t1.Min(t2); float4 far = t1.Max(t2); float4 rayDirAbs = rayDir.Abs(); if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box. return false; if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question { tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB. tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB. } else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box. return false; return tNear < tFar; */ __m128 recipDir = _mm_rcp_ps(rayDir.v); // Note: The above performs an approximate reciprocal (11 bits of precision). 
// For a full precision reciprocal, perform a div: // __m128 recipDir = _mm_div_ps(_mm_set1_ps(1.f), rayDir.v); __m128 t1 = _mm_mul_ps(_mm_sub_ps(MinPoint_SSE(), rayPos.v), recipDir); __m128 t2 = _mm_mul_ps(_mm_sub_ps(MaxPoint_SSE(), rayPos.v), recipDir); __m128 nearD = _mm_min_ps(t1, t2); // [0 n3 n2 n1] __m128 farD = _mm_max_ps(t1, t2); // [0 f3 f2 f1] // Check if the ray direction is parallel to any of the cardinal axes, and if so, // mask those [near, far] ranges away from the hit test computations. __m128 rayDirAbs = abs_ps(rayDir.v); const __m128 epsilon = _mm_set1_ps(1e-4f); // zeroDirections[i] will be nonzero for each axis i the ray is parallel to. __m128 zeroDirections = _mm_cmple_ps(rayDirAbs, epsilon); const __m128 floatInf = _mm_set1_ps(FLOAT_INF); const __m128 floatNegInf = _mm_set1_ps(-FLOAT_INF); // If the ray is parallel to one of the axes, replace the slab range for that axis // with [-inf, inf] range instead. (which is a no-op in the comparisons below) nearD = cmov_ps(nearD, floatNegInf, zeroDirections); farD = cmov_ps(farD , floatInf, zeroDirections); // Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2]) // to see if there is an overlap in the hit ranges. __m128 v1 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(0, 0, 0, 0)); // [f1 f1 n1 n1] __m128 v2 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(1, 1, 1, 1)); // [f2 f2 n2 n2] __m128 v3 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(2, 2, 2, 2)); // [f3 f3 n3 n3] nearD = _mm_max_ps(v1, _mm_max_ps(v2, v3)); farD = _mm_min_ps(v1, _mm_min_ps(v2, v3)); farD = _mm_shuffle_ps(farD, farD, _MM_SHUFFLE(3, 3, 3, 3)); // Unpack the result from high offset in the register. nearD = _mm_max_ps(nearD, _mm_set_ss(tNear)); farD = _mm_min_ps(farD, _mm_set_ss(tFar)); // Finally, test if the ranges overlap. 
__m128 rangeIntersects = _mm_cmple_ss(nearD, farD); // To store out out the interval of intersection, uncomment the following: // These are disabled, since without these, the whole function runs without a single memory store, // which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown. // For now, using the SSE version only where the tNear and tFar ranges are not interesting. // _mm_store_ss(&tNear, nearD); // _mm_store_ss(&tFar, farD); // To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction // is parallel to. __m128 out2 = _mm_cmplt_ps(rayPos.v, MinPoint_SSE()); __m128 out3 = _mm_cmpgt_ps(rayPos.v, MaxPoint_SSE()); out2 = _mm_or_ps(out2, out3); zeroDirections = _mm_and_ps(zeroDirections, out2); __m128 yOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(1,1,1,1)); __m128 zOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(2,2,2,2)); zeroDirections = _mm_or_ps(_mm_or_ps(zeroDirections, yOut), zOut); // Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being // parallel to some cardinal axis. __m128 intersects = _mm_andnot_ps(zeroDirections, rangeIntersects); __m128 epsilonMasked = _mm_and_ps(epsilon, intersects); return _mm_comieq_ss(epsilon, epsilonMasked) != 0; }
/*
 * SSE2 implementation of the graduated density filter.
 *
 * Each output row starts from a scalar `position` that is advanced by a
 * constant step per pixel; the filter strength at a pixel is
 * 2^(density * CLIP(0.5 +/- position)).  With `c` the filter color and
 * `d` that strength, pixels are written as
 *     density > 0 :  max(0, in / (c + (1 - c) * d))
 *     otherwise   :  max(0, in * (c + (1 - c) * d))
 * One __m128 is loaded and streamed per pixel, advancing by `piece->colors`
 * floats.  When the pipe displays a mask the alpha channel is copied over
 * from the input afterwards.
 *
 * NOTE(review): aligned loads / streaming stores presume 16-byte aligned
 * buffers and four floats per pixel -- confirm against the pipeline.
 */
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_graduatednd_data_t *const data = (const dt_iop_graduatednd_data_t *const)piece->data;
  const int ch = piece->colors;

  // region origin and scaled full-image size
  const int origin_x = (roi_in->x);
  const int origin_y = (roi_in->y);
  const float full_w = piece->buf_in.width * roi_out->scale;
  const float full_h = piece->buf_in.height * roi_out->scale;
  const float half_w = full_w / 2.0;
  const float half_h = full_h / 2.0;
  const float inv_half_w = 1.0 / half_w;
  const float inv_half_h = 1.0 / half_h;

  // orientation of the gradient line
  const float angle = (-data->rotation / 180) * M_PI;
  const float sin_a = sin(angle);
  const float cos_a = cos(angle);

  const float filter_radius = sqrt((half_h * half_h) + (half_w * half_w)) / half_h;
  const float shift = data->offset / 100.0 * 2;

#if 1
  const float steepness
      = 1.0 / filter_radius / (1.0 - (0.5 + (data->compression / 100.0) * 0.9 / 2.0)) * 0.5;
#else
  const float compression = data->compression / 100.0f;
  const float t = 1.0f - .8f / (.8f + compression);
  const float c = 1.0f + 1000.0f * powf(4.0, compression);
#endif

  if(data->density > 0)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      size_t row_start = (size_t)roi_out->width * y * ch;
      const float *in = (float *)ivoid + row_start;
      float *out = (float *)ovoid + row_start;

      // position of the row's first pixel relative to the gradient, and its per-pixel increment
      float position = (sin_a * (-1.0 + origin_x * inv_half_w) - cos_a * (-1.0 + (origin_y + y) * inv_half_h)
                        - 1.0 + shift) * steepness;
      const float position_step = sin_a * inv_half_w * steepness;

      __m128 tint = _mm_set_ps(0, data->color[2], data->color[1], data->color[0]);
      __m128 one_minus_tint = _mm_sub_ps(_mm_set1_ps(1.0f), tint);

      for(int x = 0; x < roi_out->width; x++, in += ch, out += ch)
      {
#if 1
        // !!! approximation is ok only when highest density is 8
        // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==8,CLIP(0.5+length) == 1.0 it gives 0.6% of error
        const float arg = 0.693147181f /* ln2 */ * (data->density * CLIP(0.5f + position) / 8.0f);
        float term2 = arg * arg * 0.5f;
        float term3 = term2 * arg * 0.333333333f;
        float term4 = term3 * arg * 0.25f;
        float eighth_root = 1 + arg + term2 + term3 + term4; /* taylor series for e^x till x^4 */
        // raise to the 8th power by squaring three times
        __m128 density = _mm_set1_ps(eighth_root);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
#else
        // use fair exp2f
        __m128 density = _mm_set1_ps(exp2f(data->density * CLIP(0.5f + position)));
#endif

        /* max(0,in / (c + (1-c)*density)) */
        _mm_stream_ps(out, _mm_max_ps(_mm_set1_ps(0.0f),
                                      _mm_div_ps(_mm_load_ps(in),
                                                 _mm_add_ps(tint, _mm_mul_ps(one_minus_tint, density)))));

        position += position_step;
      }
    }
  }
  else
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      size_t row_start = (size_t)roi_out->width * y * ch;
      const float *in = (float *)ivoid + row_start;
      float *out = (float *)ovoid + row_start;

      // position of the row's first pixel relative to the gradient, and its per-pixel increment
      float position = (sin_a * (-1.0f + origin_x * inv_half_w) - cos_a * (-1.0f + (origin_y + y) * inv_half_h)
                        - 1.0f + shift) * steepness;
      const float position_step = sin_a * inv_half_w * steepness;

      __m128 tint = _mm_set_ps(0, data->color[2], data->color[1], data->color[0]);
      __m128 one_minus_tint = _mm_sub_ps(_mm_set1_ps(1.0f), tint);

      for(int x = 0; x < roi_out->width; x++, in += ch, out += ch)
      {
#if 1
        // !!! approximation is ok only when lowest density is -8
        // for input x = (-data->density * CLIP( 0.5-length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==-8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
        const float arg = 0.693147181f /* ln2 */ * (-data->density * CLIP(0.5f - position) / 8.0f);
        float term2 = arg * arg * 0.5f;
        float term3 = term2 * arg * 0.333333333f;
        float term4 = term3 * arg * 0.25f;
        float eighth_root = 1 + arg + term2 + term3 + term4; /* taylor series for e^x till x^4 */
        // raise to the 8th power by squaring three times
        __m128 density = _mm_set1_ps(eighth_root);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
#else
        __m128 density = _mm_set1_ps(exp2f(-data->density * CLIP(0.5f - position)));
#endif

        /* max(0,in * (c + (1-c)*density)) */
        _mm_stream_ps(out, _mm_max_ps(_mm_set1_ps(0.0f),
                                      _mm_mul_ps(_mm_load_ps(in),
                                                 _mm_add_ps(tint, _mm_mul_ps(one_minus_tint, density)))));

        position += position_step;
      }
    }
  }
  // make the streaming stores visible before the buffer is read again
  _mm_sfence();

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
// Updates the following smoothed Power Spectral Densities (PSD): // - sd : near-end // - se : residual echo // - sx : far-end // - sde : cross-PSD of near-end and residual echo // - sxd : cross-PSD of near-end and far-end // // In addition to updating the PSDs, also the filter diverge state is determined // upon actions are taken. static void SmoothedPSD(AecCore* aec, float efw[2][PART_LEN1], float dfw[2][PART_LEN1], float xfw[2][PART_LEN1], int* extreme_filter_divergence) { // Power estimate smoothing coefficients. const float* ptrGCoh = aec->extended_filter_enabled ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1] : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1]; int i; float sdSum = 0, seSum = 0; const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD); const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]); const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]); __m128 vec_sdSum = _mm_set1_ps(0.0f); __m128 vec_seSum = _mm_set1_ps(0.0f); for (i = 0; i + 3 < PART_LEN1; i += 4) { const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]); const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]); const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]); const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]); const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]); const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]); __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0); __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0); __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0); __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0); __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0); __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0); vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1)); vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1)); vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1)); vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15); vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, 
vec_GCoh1)); vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1)); vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1)); _mm_storeu_ps(&aec->sd[i], vec_sd); _mm_storeu_ps(&aec->se[i], vec_se); _mm_storeu_ps(&aec->sx[i], vec_sx); { const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]); const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]); __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0); __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1); vec_a = _mm_mul_ps(vec_a, vec_GCoh0); vec_b = _mm_mul_ps(vec_b, vec_GCoh0); vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1)); vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0)); vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1)); vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1)); _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b)); _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b)); } { const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]); const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]); __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0)); __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1)); __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0); __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1); vec_a = _mm_mul_ps(vec_a, vec_GCoh0); vec_b = _mm_mul_ps(vec_b, vec_GCoh0); vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1)); vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0)); vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1)); vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1)); _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b)); _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, 
vec_b)); } vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd); vec_seSum = _mm_add_ps(vec_seSum, vec_se); } _mm_add_ps_4x1(vec_sdSum, &sdSum); _mm_add_ps_4x1(vec_seSum, &seSum); for (; i < PART_LEN1; i++) { aec->sd[i] = ptrGCoh[0] * aec->sd[i] + ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]); aec->se[i] = ptrGCoh[0] * aec->se[i] + ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]); // We threshold here to protect against the ill-effects of a zero farend. // The threshold is not arbitrarily chosen, but balances protection and // adverse interaction with the algorithm's tuning. // TODO(bjornv): investigate further why this is so sensitive. aec->sx[i] = ptrGCoh[0] * aec->sx[i] + ptrGCoh[1] * WEBRTC_SPL_MAX( xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], WebRtcAec_kMinFarendPSD); aec->sde[i][0] = ptrGCoh[0] * aec->sde[i][0] + ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]); aec->sde[i][1] = ptrGCoh[0] * aec->sde[i][1] + ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]); aec->sxd[i][0] = ptrGCoh[0] * aec->sxd[i][0] + ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]); aec->sxd[i][1] = ptrGCoh[0] * aec->sxd[i][1] + ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]); sdSum += aec->sd[i]; seSum += aec->se[i]; } // Divergent filter safeguard update. aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum; // Signal extreme filter divergence if the error is significantly larger // than the nearend (13 dB). *extreme_filter_divergence = (seSum > (19.95f * sdSum)); }
/*
 * Scores four query/reference pairs at once, one pair per SSE lane, with a
 * local-alignment style recurrence (every cell is floored at zero) and
 * separate gap-open / gap-extend penalties.
 *
 *  q, q_len, max_q_len : the four query strings, their lengths, and the
 *                        column count used for every lane.
 *  r, r_len, max_r_len : the four reference strings, their lengths, and the
 *                        row count used for every lane.
 *  profile             : substitution score, indexed [query char][ref char].
 *  H  : score matrix, written at (row * max_q_len + col) * 4 + lane.
 *  F  : one row of "gap in query" scores, written at col * 4 + lane and
 *       overwritten row after row.
 *  C  : per-cell 3-bit flags, same layout as H:
 *       bit 2 = diagonal >= left, bit 1 = up >= 0, bit 0 = max(diag,left) >= max(up,0).
 *  max_score : receives the best cell score of each lane.
 *
 * H, F, C and max_score are accessed with aligned SSE loads/stores, so they
 * must be 16-byte aligned. num_seqs is not read; exactly four lanes are
 * always processed. Cells beyond a lane's own q_len / r_len use a
 * substitution score of -1000.
 */
void sse_matrix(int num_seqs, char **q, int *q_len, int max_q_len,
                char **r, int *r_len, int max_r_len,
                float profile[128][128], float gap_open, float gap_extend,
                float *H, float *F, int *C, float *max_score) {
  enum { LANES = 4 };
  const float out_of_range = -1000.0f;
  const int row_stride = LANES * max_q_len;

  const __m128 zeros = _mm_setzero_ps();
  const __m128 ones = _mm_set1_ps(1);
  const __m128 open_cost = _mm_set1_ps(gap_open);
  const __m128 extend_cost = _mm_set1_ps(gap_extend);
  const __m128i no_bits = _mm_set_epi32(0, 0, 0, 0);

  __m128 best = _mm_setzero_ps();
  __m128 h_cur = zeros;
  __m128 left = zeros;
  float subst[LANES];

  /* ---- first reference row: nothing above, nothing on the diagonal ---- */
  for (int col = 0; col < max_q_len; col++) {
    const int cell = LANES * col;

    /* coming from the left: gap in the reference */
    left = _mm_max_ps(_mm_sub_ps(left, extend_cost),
                      _mm_sub_ps(h_cur, open_cost));

    /* diagonal: match / mismatch against the first reference character */
    for (int lane = 0; lane < LANES; lane++) {
      if (q_len[lane] > col) {
        subst[lane] = profile[q[lane][col]][r[lane][0]];
      } else {
        subst[lane] = out_of_range;
      }
    }
    const __m128 diag = _mm_add_ps(zeros, _mm_loadu_ps(subst));

    /* coming from above: gap in the query (previous row is all zero) */
    const __m128 up = _mm_max_ps(_mm_sub_ps(zeros, extend_cost),
                                 _mm_sub_ps(zeros, open_cost));

    const __m128 best_dl = _mm_max_ps(diag, left);
    const __m128 best_uz = _mm_max_ps(up, zeros);
    h_cur = _mm_max_ps(best_dl, best_uz);
    best = _mm_max_ps(best, h_cur);

    /* min(mask, 1) maps an all-ones compare mask to 1.0f and zero to 0.0f */
    const __m128 diag_ge_left = _mm_min_ps(_mm_cmpge_ps(diag, left), ones);
    const __m128 up_ge_zero = _mm_min_ps(_mm_cmpge_ps(up, zeros), ones);
    const __m128 dl_ge_uz = _mm_min_ps(_mm_cmpge_ps(best_dl, best_uz), ones);

    __m128i flags = _mm_or_si128(no_bits, _mm_cvtps_epi32(diag_ge_left));
    flags = _mm_slli_epi32(flags, 1);
    flags = _mm_or_si128(flags, _mm_cvtps_epi32(up_ge_zero));
    flags = _mm_slli_epi32(flags, 1);
    flags = _mm_or_si128(flags, _mm_cvtps_epi32(dl_ge_uz));

    _mm_store_ps(&H[cell], h_cur);
    _mm_store_ps(&F[cell], up);
    _mm_store_si128((__m128i *)&C[cell], flags);
  }

  /* ---- remaining reference rows ---- */
  for (int row = 1; row < max_r_len; row++) {
    const int row_base = row * row_stride;
    __m128 above_left = zeros; /* H[row-1][col-1]; zero for col == 0 */

    h_cur = zeros;
    left = zeros;

    for (int col = 0; col < max_q_len; col++) {
      const int cell = LANES * col;
      const int pos = row_base + cell;

      /* coming from the left: gap in the reference */
      left = _mm_max_ps(_mm_sub_ps(left, extend_cost),
                        _mm_sub_ps(h_cur, open_cost));

      /* diagonal: match / mismatch */
      for (int lane = 0; lane < LANES; lane++) {
        if (q_len[lane] > col && r_len[lane] > row) {
          subst[lane] = profile[q[lane][col]][r[lane][row]];
        } else {
          subst[lane] = out_of_range;
        }
      }
      const __m128 diag = _mm_add_ps(above_left, _mm_loadu_ps(subst));

      /* coming from above: gap in the query */
      const __m128 above = _mm_load_ps(&H[pos - row_stride]);
      const __m128 up = _mm_max_ps(
          _mm_sub_ps(_mm_load_ps(&F[cell]), extend_cost),
          _mm_sub_ps(above, open_cost));
      above_left = above; /* becomes the diagonal source of the next column */

      const __m128 best_dl = _mm_max_ps(diag, left);
      const __m128 best_uz = _mm_max_ps(up, zeros);
      h_cur = _mm_max_ps(best_dl, best_uz);
      best = _mm_max_ps(best, h_cur);

      const __m128 diag_ge_left = _mm_min_ps(_mm_cmpge_ps(diag, left), ones);
      const __m128 up_ge_zero = _mm_min_ps(_mm_cmpge_ps(up, zeros), ones);
      const __m128 dl_ge_uz = _mm_min_ps(_mm_cmpge_ps(best_dl, best_uz), ones);

      __m128i flags = _mm_or_si128(no_bits, _mm_cvtps_epi32(diag_ge_left));
      flags = _mm_slli_epi32(flags, 1);
      flags = _mm_or_si128(flags, _mm_cvtps_epi32(up_ge_zero));
      flags = _mm_slli_epi32(flags, 1);
      flags = _mm_or_si128(flags, _mm_cvtps_epi32(dl_ge_uz));

      _mm_store_ps(&H[pos], h_cur);
      _mm_store_ps(&F[cell], up);
      _mm_store_si128((__m128i *)&C[pos], flags);
    }
  }

  _mm_store_ps(max_score, best);
}
void convert_to_rgb_fast() { unsigned i,j,c; int row, col, k; ushort *img; float out_cam[3][4]; double num, inverse[3][3]; static const double xyzd50_srgb[3][3] = { { 0.436083, 0.385083, 0.143055 }, { 0.222507, 0.716888, 0.060608 }, { 0.013930, 0.097097, 0.714022 } }; static const double rgb_rgb[3][3] = { { 1,0,0 }, { 0,1,0 }, { 0,0,1 } }; static const double adobe_rgb[3][3] = { { 0.715146, 0.284856, 0.000000 }, { 0.000000, 1.000000, 0.000000 }, { 0.000000, 0.041166, 0.958839 } }; static const double wide_rgb[3][3] = { { 0.593087, 0.404710, 0.002206 }, { 0.095413, 0.843149, 0.061439 }, { 0.011621, 0.069091, 0.919288 } }; static const double prophoto_rgb[3][3] = { { 0.529317, 0.330092, 0.140588 }, { 0.098368, 0.873465, 0.028169 }, { 0.016879, 0.117663, 0.865457 } }; static const double (*out_rgb[])[3] = { rgb_rgb, adobe_rgb, wide_rgb, prophoto_rgb, xyz_rgb }; static const char *name[] = { "sRGB", "Adobe RGB (1998)", "WideGamut D65", "ProPhoto D65", "XYZ" }; static const unsigned phead[] = { 1024, 0, 0x2100000, 0x6d6e7472, 0x52474220, 0x58595a20, 0, 0, 0, 0x61637370, 0, 0, 0x6e6f6e65, 0, 0, 0, 0, 0xf6d6, 0x10000, 0xd32d }; unsigned pbody[] = { 10, 0x63707274, 0, 36, /* cprt */ 0x64657363, 0, 40, /* desc */ 0x77747074, 0, 20, /* wtpt */ 0x626b7074, 0, 20, /* bkpt */ 0x72545243, 0, 14, /* rTRC */ 0x67545243, 0, 14, /* gTRC */ 0x62545243, 0, 14, /* bTRC */ 0x7258595a, 0, 20, /* rXYZ */ 0x6758595a, 0, 20, /* gXYZ */ 0x6258595a, 0, 20 }; /* bXYZ */ static const unsigned pwhite[] = { 0xf351, 0x10000, 0x116cc }; unsigned pcurve[] = { 0x63757276, 0, 1, 0x1000000 }; gamma_curve (gamm[0], gamm[1], 0, 0); memcpy (out_cam, rgb_cam, sizeof out_cam); raw_color |= colors == 1 || document_mode || output_color < 1 || output_color > 5; if (!raw_color) { oprof = (unsigned *) calloc (phead[0], 1); merror (oprof, "convert_to_rgb()"); memcpy (oprof, phead, sizeof phead); if (output_color == 5) oprof[4] = oprof[5]; oprof[0] = 132 + 12*pbody[0]; for (i=0; i < pbody[0]; i++) { 
oprof[oprof[0]/4] = i ? (i > 1 ? 0x58595a20 : 0x64657363) : 0x74657874; pbody[i*3+2] = oprof[0]; oprof[0] += (pbody[i*3+3] + 3) & -4; } memcpy (oprof+32, pbody, sizeof pbody); oprof[pbody[5]/4+2] = strlen(name[output_color-1]) + 1; memcpy ((char *)oprof+pbody[8]+8, pwhite, sizeof pwhite); pcurve[3] = (short)(256/gamm[5]+0.5) << 16; for (i=4; i < 7; i++) memcpy ((char *)oprof+pbody[i*3+2], pcurve, sizeof pcurve); pseudoinverse ((double (*)[3])out_rgb[output_color-1], inverse, 3); for (i=0; i < 3; i++) for (j=0; j < 3; j++) { for (num = k=0; k < 3; k++) num += xyzd50_srgb[i][k] * inverse[j][k]; oprof[pbody[j*3+23]/4+i+2] = num * 0x10000 + 0.5; } for (i=0; i < phead[0]/4; i++) oprof[i] = htonl(oprof[i]); strcpy ((char *)oprof+pbody[2]+8, "auto-generated by dcraw"); strcpy ((char *)oprof+pbody[5]+12, name[output_color-1]); for (i=0; i < 3; i++) for (j=0; j < colors; j++) for (out_cam[i][j] = k=0; k < 3; k++) out_cam[i][j] += out_rgb[output_color-1][i][k] * rgb_cam[k][j]; } if (verbose) fprintf (stderr, raw_color ? 
_("Building histograms...\n") : _("Converting to %s colorspace...\n"), name[output_color-1]); memset (histogram, 0, sizeof histogram); if(!raw_color) { __m128 outcam0= {out_cam[0][0],out_cam[1][0],out_cam[2][0],0}, outcam1= {out_cam[0][1],out_cam[1][1],out_cam[2][1],0}, outcam2= {out_cam[0][2],out_cam[1][2],out_cam[2][2],0}, outcam3= {out_cam[0][3],out_cam[1][3],out_cam[2][3],0}; for (img=image[0]; img < image[width*height]; img+=4) { __m128 out0; __m128 vimg0 = {img[0],img[0],img[0],0}, vimg1 = {img[1],img[1],img[1],0}, vimg2 = {img[2],img[2],img[2],0}, vimg3 = {img[3],img[3],img[3],0}; // out[0] = out_cam[0][0] * img[0] // +out_cam[0][1] * img[1] // +out_cam[0][2] * img[2] // +out_cam[0][3] * img[3]; // out[1] = out_cam[1][0] * img[0] // +out_cam[1][1] * img[1] // +out_cam[1][2] * img[2] // +out_cam[1][3] * img[3]; // out[2] = out_cam[2][0] * img[0] // +out_cam[2][1] * img[1] // +out_cam[2][2] * img[2] // +out_cam[2][3] * img[3]; out0 = _mm_add_ps(_mm_add_ps( _mm_mul_ps(vimg0, outcam0), _mm_mul_ps(vimg1, outcam1) ), _mm_add_ps( _mm_mul_ps(vimg2, outcam2), _mm_mul_ps(vimg3, outcam3) )); //clip out0 = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(out0, _MM_FROUND_TO_ZERO))); __m128i o = _mm_cvtps_epi32(out0); o = _mm_packus_epi32(o,_mm_setzero_si128()); memcpy(img, &o, sizeof(short)*3); FORCC histogram[c][img[c] >> 3]++; } } else if (document_mode) {
/// Lane-wise maximum of two vec4 values, computed with _mm_max_ps.
inline vec4 max(vec4 a, vec4 b)
{
    const vec4 larger = _mm_max_ps(a, b);
    return larger;
}
/// Clamps every lane of pix, in place, to the range [EZERO, EONE]:
/// first raised to at least EZERO, then lowered to at most EONE.
/// (EZERO / EONE are defined elsewhere; presumably 0.0 and 1.0 - confirm.)
inline void ApplyClamp(__m128& pix)
{
    const __m128 raised = _mm_max_ps(pix, EZERO);
    pix = _mm_min_ps(raised, EONE);
}
/**
 * Builds a per-pixel difference image of two planar float RGBA images.
 *
 * For each group of four pixels the Euclidean distance between the right and
 * left RGBA values is computed. Where the distance exceeds options.tolerance
 * the output pixel is a blend of the right pixel (multiplied by the error
 * colour first when the overlay type is Movement) and the error colour;
 * elsewhere the left pixel is copied through unchanged.
 *
 * @param left    reference image; its width/height/stride are used for the result.
 * @param right   image compared against left.
 * @param options tolerance, error colour, overlay type/transparency, and the
 *                ignoreColor / weightByDiffPercentage switches.
 * @return the difference image (its planes live in one _aligned_malloc'ed
 *         block starting at diff.r) together with
 *         1 - differingPixels / (height * width - height * stride).
 *
 * NOTE(review): all planes are read/written with aligned 4-float accesses and
 * the pointers advance by 4 per step, so this presumably requires 16-byte
 * aligned planes and a width that is a multiple of 4 - confirm with callers.
 * NOTE(review): the result of _aligned_malloc for the image is not checked,
 * and the denominator of the returned ratio depends on what `stride` means
 * for Image - both are kept exactly as they were; please confirm.
 */
_declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options)
{
	if (options.ignoreColor)
	{
		makeGreyscale(left);
		makeGreyscale(right);
	}

	// One allocation holds the four output planes back to back: R, G, B, A.
	float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16);
	int colorOffset = left.width * left.height;
	Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 };

	float* drp = diff.r;
	float* dgp = diff.g;
	float* dbp = diff.b;
	float* dap = diff.a;

	float* lrp = left.r;
	float* lgp = left.g;
	float* lbp = left.b;
	float* lap = left.a;

	float* rrp = right.r;
	float* rgp = right.g;
	float* rbp = right.b;
	float* rap = right.a;

	Color error = ConvertToFloat(options.errorColor);

	auto er = _mm_set_ps1(error.r);
	auto eg = _mm_set_ps1(error.g);
	auto eb = _mm_set_ps1(error.b);
	auto ea = _mm_set_ps1(error.a);

	auto tolerance = _mm_set_ps1(options.tolerance);
	auto overlayTransparency = _mm_set_ps1(options.overlayTransparency);

	OverlayType overlayType = options.overlayType;
	byte weightByDiffPercentage = options.weightByDiffPercentage;

	auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0);
	auto one = _mm_set1_ps(1);
	auto zero = _mm_set1_ps(0);

	for (int y = 0; y < left.height; y++)
	{
		for (int x = 0; x < left.width; x += 4)
		{
			auto lr = _mm_load_ps(lrp);
			auto lg = _mm_load_ps(lgp);
			auto lb = _mm_load_ps(lbp);
			auto la = _mm_load_ps(lap);

			auto rr = _mm_load_ps(rrp);
			auto rg = _mm_load_ps(rgp);
			auto rb = _mm_load_ps(rbp);
			auto ra = _mm_load_ps(rap);

			auto rdiff = _mm_sub_ps(rr, lr);
			auto gdiff = _mm_sub_ps(rg, lg);
			auto bdiff = _mm_sub_ps(rb, lb);
			auto adiff = _mm_sub_ps(ra, la);

			// Euclidean distance over the four channels.
			auto distance = _mm_mul_ps(rdiff, rdiff);
			distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff));
			distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff));
			distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff));
			distance = _mm_sqrt_ps(distance);

			// Blend factor, optionally scaled by the distance, clamped to [0, 1].
			auto t = overlayTransparency;
			if (weightByDiffPercentage)
			{
				t = _mm_mul_ps(t, distance);
			}

			auto isdiff = _mm_cmpgt_ps(distance, tolerance);

			t = _mm_min_ps(one, _mm_max_ps(zero, t));

			auto mlr = rr;
			auto mlg = rg;
			auto mlb = rb;
			auto mla = ra;

			if (overlayType == OverlayType::Movement)
			{
				mlr = _mm_mul_ps(mlr, er);
				mlg = _mm_mul_ps(mlg, eg);
				mlb = _mm_mul_ps(mlb, eb);
				mla = _mm_mul_ps(mla, ea);
			}

			auto oneMinusT = _mm_sub_ps(one, t);

			auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t));
			auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t));
			auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t));
			auto mixedA = one;
			if (overlayType != OverlayType::Movement)
			{
				mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t));
			}

			// Branch-free select: (((b ^ a) & mask) ^ a) yields b where mask is set, a elsewhere.
			auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr)));
			auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg)));
			auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb)));
			auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la)));

			// A set compare lane is 0xFFFFFFFF, i.e. -1 as an integer, so
			// subtracting the mask adds 1 exactly in the differing lanes.
			diffPixelCount = _mm_sub_epi32(diffPixelCount, _mm_castps_si128(isdiff));

			_mm_store_ps(drp, dr);
			_mm_store_ps(dgp, dg);
			_mm_store_ps(dbp, db);
			_mm_store_ps(dap, da);

			drp += 4;
			dgp += 4;
			dbp += 4;
			dap += 4;

			lrp += 4;
			lgp += 4;
			lbp += 4;
			lap += 4;

			rrp += 4;
			rgp += 4;
			rbp += 4;
			rap += 4;
		}
	}

	// Sum the four lane counters via a stack buffer; the unaligned store
	// removes the need for a (previously unchecked) aligned heap allocation.
	int pixelCounts[4];
	_mm_storeu_si128((__m128i*)pixelCounts, diffPixelCount);
	int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + pixelCounts[3];

	return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) };
}
void ConvertVideoFrame420ToRGB( const th_info *tinfo, const th_ycbcr_buffer ycbcr, unsigned char* pixels ) { // some constant definitions const float single_yoffset = 16.0f; const float single_yexcursion = 219.0f; const float single_cboffset = 128.0f; const float single_cbexcursion = 224.0f; const float single_croffset = 128.0f; const float single_crexcursion = 224.0f; const float kr = 0.299f; const float kb = 0.114f; if (pixels) { const th_img_plane yplane = ycbcr[0]; const th_img_plane cbplane = ycbcr[1]; const th_img_plane crplane = ycbcr[2]; const int width = tinfo->pic_width; const int height = tinfo->pic_height; const int wh = width*height; assert(wh == yplane.width*yplane.height); assert(width % 16 == 0); assert(cbplane.width * 2 == yplane.width); assert(crplane.width * 2 == yplane.width); const unsigned char* ydata = yplane.data; const unsigned char* cbdata = cbplane.data; const unsigned char* crdata = crplane.data; const int ystride = yplane.stride; const int cbstride = cbplane.stride; const int crstride = crplane.stride; const __m128 yoffset = _mm_set_ps1(-single_yoffset); const __m128 yexcursion = _mm_set_ps1(1.0f / single_yexcursion); const __m128 cboffset = _mm_set_ps1(-single_cboffset); const __m128 cbexcursion = _mm_set_ps1(1.0f / single_cbexcursion); const __m128 croffset = _mm_set_ps1(-single_croffset); const __m128 crexcursion = _mm_set_ps1(1.0f / single_crexcursion); const __m128 fr = _mm_set_ps1(255.0f * 2 * (1 - kr)); const __m128 fb = _mm_set_ps1(255.0f * 2 * (1 - kb)); const __m128 f1 = _mm_set_ps1(255.0f * (2 * (1 - kb) * kb / (1 - kb - kr))); const __m128 f2 = _mm_set_ps1(255.0f * (2 * (1 - kr) * kr / (1 - kb - kr))); const __m128 c255 = _mm_set_ps1(255.0f); for(int h = 0; h < height; ++h) { for(int w = 0; w < width; w += 16) { const __m128i yIn = _mm_loadu_si128((const __m128i*)(ydata + h*ystride + w)); // assumption is that there is only one pixel in the cb/cr plane per 4 pixels (2x2) in the y plane const __m128i cbIn = 
_mm_loadu_si128((const __m128i*)(cbdata + h/2*cbstride + w/2)); const __m128i crIn = _mm_loadu_si128((const __m128i*)(crdata + h/2*crstride + w/2)); // yIn ep8 -> ps const __m128i yInlo = _mm_unpacklo_epi8((yIn), _mm_setzero_si128()); const __m128i yInHi = _mm_unpackhi_epi8((yIn), _mm_setzero_si128()); const __m128i yIn1 = _mm_unpacklo_epi16(yInlo, _mm_setzero_si128()); const __m128i yIn4 = _mm_unpackhi_epi16(yInlo, _mm_setzero_si128()); const __m128i yIn8 = _mm_unpacklo_epi16(yInHi, _mm_setzero_si128()); const __m128i yIn12 = _mm_unpackhi_epi16(yInHi, _mm_setzero_si128()); const __m128 yIn1ps = _mm_cvtepi32_ps(yIn1); const __m128 yIn2ps = _mm_cvtepi32_ps(yIn4); const __m128 yIn3ps = _mm_cvtepi32_ps(yIn8); const __m128 yIn4ps = _mm_cvtepi32_ps(yIn12); // cbIn ep8 -> ps const __m128i cbInExp = _mm_unpacklo_epi8(cbIn, cbIn); const __m128i cbInlo = _mm_unpacklo_epi8(cbInExp, _mm_setzero_si128()); const __m128i cbInHi = _mm_unpackhi_epi8(cbInExp, _mm_setzero_si128()); const __m128i cbIn1 = _mm_unpacklo_epi16(cbInlo, _mm_setzero_si128()); const __m128i cbIn4 = _mm_unpackhi_epi16(cbInlo, _mm_setzero_si128()); const __m128i cbIn8 = _mm_unpacklo_epi16(cbInHi, _mm_setzero_si128()); const __m128i cbIn12 = _mm_unpackhi_epi16(cbInHi, _mm_setzero_si128()); const __m128 cbIn1ps = _mm_cvtepi32_ps(cbIn1); const __m128 cbIn2ps = _mm_cvtepi32_ps(cbIn4); const __m128 cbIn3ps = _mm_cvtepi32_ps(cbIn8); const __m128 cbIn4ps = _mm_cvtepi32_ps(cbIn12); // crIn ep8 -> ps const __m128i crInExp = _mm_unpacklo_epi8(crIn, crIn); const __m128i crInlo = _mm_unpacklo_epi8(crInExp, _mm_setzero_si128()); const __m128i crInHi = _mm_unpackhi_epi8(crInExp, _mm_setzero_si128()); const __m128i crIn1 = _mm_unpacklo_epi16(crInlo, _mm_setzero_si128()); const __m128i crIn4 = _mm_unpackhi_epi16(crInlo, _mm_setzero_si128()); const __m128i crIn8 = _mm_unpacklo_epi16(crInHi, _mm_setzero_si128()); const __m128i crIn12 = _mm_unpackhi_epi16(crInHi, _mm_setzero_si128()); const __m128 crIn1ps = 
_mm_cvtepi32_ps(crIn1); const __m128 crIn2ps = _mm_cvtepi32_ps(crIn4); const __m128 crIn3ps = _mm_cvtepi32_ps(crIn8); const __m128 crIn4ps = _mm_cvtepi32_ps(crIn12); // map [0..255] to [-1/2..+1/2] resp. [0..1] const __m128 yOut1ps = _mm_mul_ps(_mm_add_ps(yIn1ps, yoffset), yexcursion); const __m128 yOut2ps = _mm_mul_ps(_mm_add_ps(yIn2ps, yoffset), yexcursion); const __m128 yOut3ps = _mm_mul_ps(_mm_add_ps(yIn3ps, yoffset), yexcursion); const __m128 yOut4ps = _mm_mul_ps(_mm_add_ps(yIn4ps, yoffset), yexcursion); const __m128 cbOut1ps = _mm_mul_ps(_mm_add_ps(cbIn1ps, cboffset), cbexcursion); const __m128 cbOut2ps = _mm_mul_ps(_mm_add_ps(cbIn2ps, cboffset), cbexcursion); const __m128 cbOut3ps = _mm_mul_ps(_mm_add_ps(cbIn3ps, cboffset), cbexcursion); const __m128 cbOut4ps = _mm_mul_ps(_mm_add_ps(cbIn4ps, cboffset), cbexcursion); const __m128 crOut1ps = _mm_mul_ps(_mm_add_ps(crIn1ps, croffset), crexcursion); const __m128 crOut2ps = _mm_mul_ps(_mm_add_ps(crIn2ps, croffset), crexcursion); const __m128 crOut3ps = _mm_mul_ps(_mm_add_ps(crIn3ps, croffset), crexcursion); const __m128 crOut4ps = _mm_mul_ps(_mm_add_ps(crIn4ps, croffset), crexcursion); // do the actual conversion math (on range 0..255/-127..127 instead or 0..1/-1/2..+1/2 const __m128 y1_255 = _mm_mul_ps(c255, yOut1ps); const __m128 y2_255 = _mm_mul_ps(c255, yOut2ps); const __m128 y3_255 = _mm_mul_ps(c255, yOut3ps); const __m128 y4_255 = _mm_mul_ps(c255, yOut4ps); const __m128 r1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fr, crOut1ps)); const __m128 r2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fr, crOut2ps)); const __m128 r3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fr, crOut3ps)); const __m128 r4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fr, crOut4ps)); const __m128 g1_1 = _mm_sub_ps(_mm_sub_ps(y1_255, _mm_mul_ps(f1, cbOut1ps)), _mm_mul_ps(f2, crOut1ps)); const __m128 g2_1 = _mm_sub_ps(_mm_sub_ps(y2_255, _mm_mul_ps(f1, cbOut2ps)), _mm_mul_ps(f2, crOut2ps)); const __m128 g3_1 = _mm_sub_ps(_mm_sub_ps(y3_255, _mm_mul_ps(f1, cbOut3ps)), 
_mm_mul_ps(f2, crOut3ps)); const __m128 g4_1 = _mm_sub_ps(_mm_sub_ps(y4_255, _mm_mul_ps(f1, cbOut4ps)), _mm_mul_ps(f2, crOut4ps)); const __m128 b1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fb, cbOut1ps)); const __m128 b2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fb, cbOut2ps)); const __m128 b3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fb, cbOut3ps)); const __m128 b4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fb, cbOut4ps)); // clip to 255 const __m128 r1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r1_1)); const __m128 r2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r2_1)); const __m128 r3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r3_1)); const __m128 r4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r4_1)); const __m128 g1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g1_1)); const __m128 g2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g2_1)); const __m128 g3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g3_1)); const __m128 g4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g4_1)); const __m128 b1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b1_1)); const __m128 b2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b2_1)); const __m128 b3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b3_1)); const __m128 b4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b4_1)); // multiplex rgb channels #define rgb_multiplex(no) \ const __m128 rgb##no##_1 = _mm_shuffle_ps( \ _mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(0, 0, 0, 0)), \ _mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(1, 1, 0, 0)), \ _MM_SHUFFLE(2, 0, 2, 0)); \ const __m128 rgb##no##_2 = _mm_shuffle_ps( \ _mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(1, 1, 1, 1)), \ _mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(2, 2, 2, 2)), \ _MM_SHUFFLE(2, 0, 2, 0)); \ const __m128 rgb##no##_3 = _mm_shuffle_ps( \ _mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(3, 3, 2, 2)), \ _mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(3, 3, 3, 3)), \ _MM_SHUFFLE(2, 0, 2, 0)); rgb_multiplex(1); rgb_multiplex(2); rgb_multiplex(3); rgb_multiplex(4); #undef 
rgb_multiplex // pack 32bit -> 8bit const __m128i pack1l = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_1), _mm_cvtps_epi32(rgb1_2)); const __m128i pack1h = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_3), _mm_cvtps_epi32(rgb2_1)); const __m128i pack1 = _mm_packus_epi16(pack1l, pack1h); const __m128i pack2l = _mm_packs_epi32(_mm_cvtps_epi32(rgb2_2), _mm_cvtps_epi32(rgb2_3)); const __m128i pack2h = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_1), _mm_cvtps_epi32(rgb3_2)); const __m128i pack2 = _mm_packus_epi16(pack2l, pack2h); const __m128i pack3l = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_3), _mm_cvtps_epi32(rgb4_1)); const __m128i pack3h = _mm_packs_epi32(_mm_cvtps_epi32(rgb4_2), _mm_cvtps_epi32(rgb4_3)); const __m128i pack3 = _mm_packus_epi16(pack3l, pack3h); // and finally store in output _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 0*16), pack1); _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 1*16), pack2); _mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 2*16), pack3); } } } // if }
// Lane-wise maximum of x and y; thin wrapper around _mm_max_ps.
RETf MAX_SSE(const __m128 x, const __m128 y)
{
    const __m128 larger = _mm_max_ps(x, y);
    return larger;
}
static __m128i cielabv (union hvrgbpix rgb) { __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5); __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0); __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0); __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0); __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]); __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]); __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]); __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]); __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]); __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v)); xvxyz[0] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO))); xvxyz[1] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO))); __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]); __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]); #ifdef __AVX__ __m256 vlab, vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], 0}, vxyz2 = {0, cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]}; vlab = _mm256_sub_ps(vxyz,vxyz2); vlab = _mm256_mul_ps(vlab, 
_mm256_setr_ps(116,500,200,0,116,500,200,0)); vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0)); vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64)); vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO); __m256i vlabi = _mm256_cvtps_epi32(vlab); return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]); #else __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], 0}; __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], 0}; vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3))); vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0)); vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0)); vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64)); vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO); vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3))); vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0)); vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0)); vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64)); vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO); return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh)); #endif }
/** * @brief mux all audio ports to events * @param data * @param offset * @param nevents */ void AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data, unsigned int offset, unsigned int nevents) { unsigned int j; quadlet_t *target_event; int i; float * client_buffers[4]; float tmp_values[4] __attribute__ ((aligned (16))); uint32_t tmp_values_int[4] __attribute__ ((aligned (16))); // prepare the scratch buffer assert(m_scratch_buffer_size_bytes > nevents * 4); memset(m_scratch_buffer, 0, nevents * 4); const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000); const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER); #if AMDTP_CLIP_FLOATS const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0); const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0); #endif // this assumes that audio ports are sorted by position, // and that there are no gaps for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) { struct _MBLA_port_cache *p; // get the port buffers for (j=0; j<4; j++) { p = &(m_audio_ports.at(i+j)); if(likely(p->buffer && p->enabled)) { client_buffers[j] = (float *) p->buffer; client_buffers[j] += offset; } else { // if a port is disabled or has no valid // buffer, use the scratch buffer (all zero's) client_buffers[j] = (float *) m_scratch_buffer; } } // the base event for this position target_event = (quadlet_t *)(data + i); // process the events for (j=0;j < nevents; j += 1) { // read the values tmp_values[0] = *(client_buffers[0]); tmp_values[1] = *(client_buffers[1]); tmp_values[2] = *(client_buffers[2]); tmp_values[3] = *(client_buffers[3]); // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i *target = (__m128i*)target_event; __m128i v_int; // clip #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = 
_mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int // (target misalignment is assumed since we don't know the m_dimension) _mm_storeu_si128 (target, v_int); // increment the buffer pointers client_buffers[0]++; client_buffers[1]++; client_buffers[2]++; client_buffers[3]++; // go to next target event position target_event += m_dimension; } } // do remaining ports // NOTE: these can be time-SSE'd for (; i < (int)m_nb_audio_ports; i++) { struct _MBLA_port_cache &p = m_audio_ports.at(i); target_event = (quadlet_t *)(data + i); #ifdef DEBUG assert(nevents + offset <= p.buffer_size ); #endif if(likely(p.buffer && p.enabled)) { float *buffer = (float *)(p.buffer); buffer += offset; for (j = 0;j < nevents; j += 4) { // read the values tmp_values[0] = *buffer; buffer++; tmp_values[1] = *buffer; buffer++; tmp_values[2] = *buffer; buffer++; tmp_values[3] = *buffer; buffer++; // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i v_int; #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = _mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 
), _mm_srli_epi32( v_int, 16 ) ); // store the packed int _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int); // increment the buffer pointers *target_event = tmp_values_int[0]; target_event += m_dimension; *target_event = tmp_values_int[1]; target_event += m_dimension; *target_event = tmp_values_int[2]; target_event += m_dimension; *target_event = tmp_values_int[3]; target_event += m_dimension; } // do the remainder of the events for(;j < nevents; j += 1) { float *in = (float *)buffer; #if AMDTP_CLIP_FLOATS // clip directly to the value of a maxed event if(unlikely(*in > 1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF); } else if(unlikely(*in < -1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x40800001); } else { float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); } #else float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); #endif buffer++; target_event += m_dimension; } } else { for (j = 0;j < nevents; j += 1) { // hardcoded byte swapped *target_event = 0x00000040; target_event += m_dimension; } } } }
float nv_vector_max(const nv_matrix_t *v, int j) { float v_max = -FLT_MAX; int i; #if NV_ENABLE_AVX { NV_ALIGNED(float, mm[9], 32); __m256 max_vec; int pk_lp = (v->n & 0xfffffff8); max_vec = _mm256_set1_ps(-FLT_MAX); for (i = 0; i < pk_lp; i += 8) { max_vec = _mm256_max_ps(max_vec, *(const __m256 *)&NV_MAT_V(v, j, i)); } _mm256_store_ps(mm, max_vec); for (i = pk_lp; i < v->n; ++i) { if (NV_MAT_V(v, j, i) > v_max) { v_max = NV_MAT_V(v, j, i); } } mm[8] = v_max; for (i = 0; i < 9; ++i) { if (mm[i] > v_max) { v_max = mm[i]; } } } #elif NV_ENABLE_SSE2 { NV_ALIGNED(float, mm[5], 16); __m128 max_vec; int pk_lp = (v->n & 0xfffffffc); max_vec = _mm_set1_ps(-FLT_MAX); for (i = 0; i < pk_lp; i += 4) { max_vec = _mm_max_ps(max_vec, *(const __m128 *)&NV_MAT_V(v, j, i)); } _mm_store_ps(mm, max_vec); for (i = pk_lp; i < v->n; ++i) { if (NV_MAT_V(v, j, i) > v_max) { v_max = NV_MAT_V(v, j, i); } } mm[4] = v_max; for (i = 0; i < 5; ++i) { if (mm[i] > v_max) { v_max = mm[i]; } } } #else for (i = 0; i < v->n; ++i) { if (NV_MAT_V(v, j, i) > v_max) { v_max = NV_MAT_V(v, j, i); } } #endif return v_max; }
void YUYVToRGB888(const XnUInt8* pYUVImage, XnUInt8* pRGBAImage, XnUInt32 nYUVSize, XnUInt32 nRGBSize) { const XnUInt8* pYUVLast = pYUVImage + nYUVSize - 8; XnUInt8* pRGBLast = pRGBAImage + nRGBSize - 16; const __m128 minus128 = _mm_set_ps1(-128); const __m128 plus113983 = _mm_set_ps1(1.13983F); const __m128 minus039466 = _mm_set_ps1(-0.39466F); const __m128 minus058060 = _mm_set_ps1(-0.58060F); const __m128 plus203211 = _mm_set_ps1(2.03211F); const __m128 zero = _mm_set_ps1(0); const __m128 plus255 = _mm_set_ps1(255); // define YUV floats __m128 y; __m128 u; __m128 v; __m128 temp; // define RGB floats __m128 r; __m128 g; __m128 b; // define RGB integers __m128i iR; __m128i iG; __m128i iB; XnUInt32* piR = (XnUInt32*)&iR; XnUInt32* piG = (XnUInt32*)&iG; XnUInt32* piB = (XnUInt32*)&iB; while (pYUVImage <= pYUVLast && pRGBAImage <= pRGBLast) { // process 4 pixels at once (values should be ordered backwards) y = _mm_set_ps(pYUVImage[YUYV_Y2 + YUYV_BPP], pYUVImage[YUYV_Y1 + YUYV_BPP], pYUVImage[YUYV_Y2], pYUVImage[YUYV_Y1]); u = _mm_set_ps(pYUVImage[YUYV_U + YUYV_BPP], pYUVImage[YUYV_U + YUYV_BPP], pYUVImage[YUYV_U], pYUVImage[YUYV_U]); v = _mm_set_ps(pYUVImage[YUYV_V + YUYV_BPP], pYUVImage[YUYV_V + YUYV_BPP], pYUVImage[YUYV_V], pYUVImage[YUYV_V]); u = _mm_add_ps(u, minus128); // u -= 128 v = _mm_add_ps(v, minus128); // v -= 128 /* http://en.wikipedia.org/wiki/YUV From YUV to RGB: R = Y + 1.13983 V G = Y - 0.39466 U - 0.58060 V B = Y + 2.03211 U */ temp = _mm_mul_ps(plus113983, v); r = _mm_add_ps(y, temp); temp = _mm_mul_ps(minus039466, u); g = _mm_add_ps(y, temp); temp = _mm_mul_ps(minus058060, v); g = _mm_add_ps(g, temp); temp = _mm_mul_ps(plus203211, u); b = _mm_add_ps(y, temp); // make sure no value is smaller than 0 r = _mm_max_ps(r, zero); g = _mm_max_ps(g, zero); b = _mm_max_ps(b, zero); // make sure no value is bigger than 255 r = _mm_min_ps(r, plus255); g = _mm_min_ps(g, plus255); b = _mm_min_ps(b, plus255); // convert floats to int16 (there is no conversion to 
uint8, just to int8). iR = _mm_cvtps_epi32(r); iG = _mm_cvtps_epi32(g); iB = _mm_cvtps_epi32(b); // extract the 4 pixels RGB values. // because we made sure values are between 0 and 255, we can just take the lower byte // of each INT16 pRGBAImage[0] = piR[0]; pRGBAImage[1] = piG[0]; pRGBAImage[2] = piB[0]; pRGBAImage[3] = 255; pRGBAImage[4] = piR[1]; pRGBAImage[5] = piG[1]; pRGBAImage[6] = piB[1]; pRGBAImage[7] = 255; pRGBAImage[8] = piR[2]; pRGBAImage[9] = piG[2]; pRGBAImage[10] = piB[2]; pRGBAImage[11] = 255; pRGBAImage[12] = piR[3]; pRGBAImage[13] = piG[3]; pRGBAImage[14] = piB[3]; pRGBAImage[15] = 255; // advance the streams pYUVImage += 8; pRGBAImage += 16; } }
/* Natural logarithm computed for 4 packed floats at once.

   xPtr points at the four inputs; they are read by reinterpreting the
   pointer as a __m128.
   NOTE(review): that dereference presumably needs the pointee to be 16-byte
   aligned -- confirm against the definition of v4sfu.

   Returns the four results as a __m128. Every lane whose input is <= 0
   (zero included) comes back with all bits set, i.e. a NaN.

   Outline: lanes with x <= 0 are remembered in invalid_mask, x is clamped
   from below to _ps_min_norm_pos, the exponent bits are extracted into e and
   the remaining bits are rebuilt into a value combined with 0.5, then a
   polynomial with coefficients _ps_cephes_log_p0 .. _ps_cephes_log_p8 is
   evaluated and combined with e * q1, e * q2 and -0.5 * x^2.

   When USE_SSE2 is not defined the integer part of the work is done with
   MMX registers (mm0/mm1), followed by _mm_empty(). */
__m128 log_ps(v4sfu *xPtr) {
  __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
  __m128i emm0;
#else
  __m64 mm0, mm1;
#endif
  __m128 one = *(__m128*)_ps_1;

  /* all-ones in every lane where the ORIGINAL input is <= 0; applied at the very end */
  __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

  x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
  /* part 1: x = frexpf(x, &e); */
  COPY_XMM_TO_MM(x, mm0, mm1);
  /* shift the raw float bits right by 23 to leave the biased exponent */
  mm0 = _mm_srli_pi32(mm0, 23);
  mm1 = _mm_srli_pi32(mm1, 23);
#else
  /* shift the raw float bits right by 23 to leave the biased exponent */
  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
  /* keep only the fractional part */
  x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
  x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
  /* now e=mm0:mm1 contain the really base-2 exponent */
  mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
  mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
  __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
  _mm_empty(); /* bye bye mmx */
#else
  /* remove the exponent bias and convert the integer exponent to float */
  emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
  __m128 e = _mm_cvtepi32_ps(emm0);
#endif

  e = _mm_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  /* branch-free form of the above: mask selects the lanes taking the "if" side */
  __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
  __m128 tmp = _mm_and_ps(x, mask);            /* x where selected, 0 elsewhere */
  x = _mm_sub_ps(x, one);
  e = _mm_sub_ps(e, _mm_and_ps(one, mask));    /* e -= 1 only in the selected lanes */
  x = _mm_add_ps(x, tmp);                      /* adds the second x only in the selected lanes */

  __m128 z = _mm_mul_ps(x,x);

  /* Horner-style evaluation of the polynomial p0..p8 in x */
  __m128 y = *(__m128*)_ps_cephes_log_p0;
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
  y = _mm_mul_ps(y, x);

  /* y = poly(x) * x * x^2 */
  y = _mm_mul_ps(y, z);

  /* y += e * q1 */
  tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
  y = _mm_add_ps(y, tmp);

  /* y -= 0.5 * x^2 */
  tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);

  /* result = x + y + e * q2 */
  tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);

  /* lanes whose input was <= 0 get every bit set, which is a NaN bit pattern */
  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}