Exemplos de _mm_max_ps em C++ (Cpp)

Exemplo n.º 1

0

Exibir arquivo

Arquivo: shapedescr.cpp Projeto: Brigadier1-9/node-dv

/* Calculates bounding rectagnle of a point set or retrieves already calculated */
CV_IMPL  CvRect
cvBoundingRect( CvArr* array, int update )
{
    CvSeqReader reader;
    CvRect  rect = { 0, 0, 0, 0 };
    CvContour contour_header;
    CvSeq* ptseq = 0;
    CvSeqBlock block;

    CvMat stub, *mat = 0;
    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i, j, k;
    int calculate = update;

    if( CV_IS_SEQ( array ))
    {
        ptseq = (CvSeq*)array;
        if( !CV_IS_SEQ_POINT_SET( ptseq ))
            CV_Error( CV_StsBadArg, "Unsupported sequence type" );

        if( ptseq->header_size < (int)sizeof(CvContour))
        {
            update = 0;
            calculate = 1;
        }
    }
    else
    {
        mat = cvGetMat( array, &stub );
        if( CV_MAT_TYPE(mat->type) == CV_32SC2 ||
            CV_MAT_TYPE(mat->type) == CV_32FC2 )
        {
            ptseq = cvPointSeqFromMat(CV_SEQ_KIND_GENERIC, mat, &contour_header, &block);
            mat = 0;
        }
        else if( CV_MAT_TYPE(mat->type) != CV_8UC1 &&
                CV_MAT_TYPE(mat->type) != CV_8SC1 )
            CV_Error( CV_StsUnsupportedFormat,
                "The image/matrix format is not supported by the function" );
        update = 0;
        calculate = 1;
    }

    if( !calculate )
        return ((CvContour*)ptseq)->rect;

    if( mat )
    {
        CvSize size = cvGetMatSize(mat);
        xmin = size.width;
        ymin = -1;

        for( i = 0; i < size.height; i++ )
        {
            uchar* _ptr = mat->data.ptr + i*mat->step;
            uchar* ptr = (uchar*)cvAlignPtr(_ptr, 4);
            int have_nz = 0, k_min, offset = (int)(ptr - _ptr);
            j = 0;
            offset = MIN(offset, size.width);
            for( ; j < offset; j++ )
                if( _ptr[j] )
                {
                    have_nz = 1;
                    break;
                }
            if( j < offset )
            {
                if( j < xmin )
                    xmin = j;
                if( j > xmax )
                    xmax = j;
            }
            if( offset < size.width )
            {
                xmin -= offset;
                xmax -= offset;
                size.width -= offset;
                j = 0;
                for( ; j <= xmin - 4; j += 4 )
                    if( *((int*)(ptr+j)) )
                        break;
                for( ; j < xmin; j++ )
                    if( ptr[j] )
                    {
                        xmin = j;
                        if( j > xmax )
                            xmax = j;
                        have_nz = 1;
                        break;
                    }
                k_min = MAX(j-1, xmax);
                k = size.width - 1;
                for( ; k > k_min && (k&3) != 3; k-- )
                    if( ptr[k] )
                        break;
                if( k > k_min && (k&3) == 3 )
                {
                    for( ; k > k_min+3; k -= 4 )
                        if( *((int*)(ptr+k-3)) )
                            break;
                }
                for( ; k > k_min; k-- )
                    if( ptr[k] )
                    {
                        xmax = k;
                        have_nz = 1;
                        break;
                    }
                if( !have_nz )
                {
                    j &= ~3;
                    for( ; j <= k - 3; j += 4 )
                        if( *((int*)(ptr+j)) )
                            break;
                    for( ; j <= k; j++ )
                        if( ptr[j] )
                        {
                            have_nz = 1;
                            break;
                        }
                }
                xmin += offset;
                xmax += offset;
                size.width += offset;
            }
            if( have_nz )
            {
                if( ymin < 0 )
                    ymin = i;
                ymax = i;
            }
        }

        if( xmin >= size.width )
            xmin = ymin = 0;
    }
    else if( ptseq->total )
    {
        int  is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2;
        cvStartReadSeq( ptseq, &reader, 0 );
        CvPoint pt;
        CV_READ_SEQ_ELEM( pt, reader );
    #if CV_SSE4_2
        if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
        {
            if( !is_float )
            {
                __m128i minval, maxval;
                minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y

                for( i = 1; i < ptseq->total; i++)
                {
                    __m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr));
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
                    minval = _mm_min_epi32(ptXY, minval);
                    maxval = _mm_max_epi32(ptXY, maxval);
                }
                xmin = _mm_cvtsi128_si32(minval);
                ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
                xmax = _mm_cvtsi128_si32(maxval);
                ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
            }
            else
            {
                __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
                minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));

                for( i = 1; i < ptseq->total; i++ )
                {
                    ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr);
                    CV_NEXT_SEQ_ELEM(sizeof(pt), reader);

                    minvalf = _mm_min_ps(minvalf, ptXY);
                    maxvalf = _mm_max_ps(maxvalf, ptXY);
                }

                float xyminf[2], xymaxf[2];
                _mm_storel_pi((__m64*)xyminf, minvalf);
                _mm_storel_pi((__m64*)xymaxf, maxvalf);
                xmin = cvFloor(xyminf[0]);
                ymin = cvFloor(xyminf[1]);
                xmax = cvFloor(xymaxf[0]);
                ymax = cvFloor(xymaxf[1]);
            }
        }
        else
    #endif
        {
            if( !is_float )
            {
                xmin = xmax = pt.x;
                ymin = ymax = pt.y;

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }
            }
            else
            {
                Cv32suf v;
                // init values
                xmin = xmax = CV_TOGGLE_FLT(pt.x);
                ymin = ymax = CV_TOGGLE_FLT(pt.y);

                for( i = 1; i < ptseq->total; i++ )
                {
                    CV_READ_SEQ_ELEM( pt, reader );
                    pt.x = CV_TOGGLE_FLT(pt.x);
                    pt.y = CV_TOGGLE_FLT(pt.y);

                    if( xmin > pt.x )
                        xmin = pt.x;

                    if( xmax < pt.x )
                        xmax = pt.x;

                    if( ymin > pt.y )
                        ymin = pt.y;

                    if( ymax < pt.y )
                        ymax = pt.y;
                }

                v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
                // because right and bottom sides of the bounding rectangle are not inclusive
                // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
                v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
                v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
            }
        }
        rect.x = xmin;
        rect.y = ymin;
        rect.width = xmax - xmin + 1;
        rect.height = ymax - ymin + 1;
    }
    if( update )
        ((CvContour*)ptseq)->rect = rect;
    return rect;
}

Exemplo n.º 2

0

Exibir arquivo

Arquivo: shapedescr.cpp Projeto: 4auka/opencv

// Calculates bounding rectagnle of a point set or retrieves already calculated
static Rect pointSetBoundingRect( const Mat& points )
{
    int npoints = points.checkVector(2);
    int depth = points.depth();
    CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S));

    int  xmin = 0, ymin = 0, xmax = -1, ymax = -1, i;
    bool is_float = depth == CV_32F;

    if( npoints == 0 )
        return Rect();

    const Point* pts = (const Point*)points.data;
    Point pt = pts[0];

#if CV_SSE4_2
    if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
    {
        if( !is_float )
        {
            __m128i minval, maxval;
            minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y

            for( i = 1; i < npoints; i++ )
            {
                __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]);
                minval = _mm_min_epi32(ptXY, minval);
                maxval = _mm_max_epi32(ptXY, maxval);
            }
            xmin = _mm_cvtsi128_si32(minval);
            ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
            xmax = _mm_cvtsi128_si32(maxval);
            ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
        }
        else
        {
            __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
            minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));

            for( i = 1; i < npoints; i++ )
            {
                ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]);

                minvalf = _mm_min_ps(minvalf, ptXY);
                maxvalf = _mm_max_ps(maxvalf, ptXY);
            }

            float xyminf[2], xymaxf[2];
            _mm_storel_pi((__m64*)xyminf, minvalf);
            _mm_storel_pi((__m64*)xymaxf, maxvalf);
            xmin = cvFloor(xyminf[0]);
            ymin = cvFloor(xyminf[1]);
            xmax = cvFloor(xymaxf[0]);
            ymax = cvFloor(xymaxf[1]);
        }
    }
    else
#endif
    {
        if( !is_float )
        {
            xmin = xmax = pt.x;
            ymin = ymax = pt.y;

            for( i = 1; i < npoints; i++ )
            {
                pt = pts[i];

                if( xmin > pt.x )
                    xmin = pt.x;

                if( xmax < pt.x )
                    xmax = pt.x;

                if( ymin > pt.y )
                    ymin = pt.y;

                if( ymax < pt.y )
                    ymax = pt.y;
            }
        }
        else
        {
            Cv32suf v;
            // init values
            xmin = xmax = CV_TOGGLE_FLT(pt.x);
            ymin = ymax = CV_TOGGLE_FLT(pt.y);

            for( i = 1; i < npoints; i++ )
            {
                pt = pts[i];
                pt.x = CV_TOGGLE_FLT(pt.x);
                pt.y = CV_TOGGLE_FLT(pt.y);

                if( xmin > pt.x )
                    xmin = pt.x;

                if( xmax < pt.x )
                    xmax = pt.x;

                if( ymin > pt.y )
                    ymin = pt.y;

                if( ymax < pt.y )
                    ymax = pt.y;
            }

            v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
            v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
            // because right and bottom sides of the bounding rectangle are not inclusive
            // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
            v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
            v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
        }
    }

    return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
}

Exemplo n.º 3

0

Exibir arquivo

Arquivo: brushtool.cpp Projeto: BuildandShoot/terravox

void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);

            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;

                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                auto factor = _mm_set1_ps(tip * strength);

                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());

                // use "before" image to which way of color change is possible, and
                // compute possible range of result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());

                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));

                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {

                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);

                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);

                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);

                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }

}

Exemplo n.º 4

0

Exibir arquivo

Arquivo: ManagerBitSet.cpp Projeto: aonorin/pipeline

    dp::math::Box3f ManagerBitSet::calculateBoundingBox( const GroupSharedPtr& group ) const
    {
#if defined(SSE)
      if ( useSSE )
      {
        GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

        __m128 minValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );
        __m128 maxValue = _mm_set1_ps( std::numeric_limits<float>::signaling_NaN() );

        char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
        for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index )
        {
          ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
          dp::math::sse::Mat44f const& modelView = *reinterpret_cast<dp::math::sse::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
          dp::math::Vec4f const& extent = objectImpl->getExtent();

          dp::math::sse::Vec4f vectors[8];
          vectors[0] = *reinterpret_cast<dp::math::sse::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

          dp::math::sse::Vec4f x( extent[0] * modelView[0] );
          dp::math::sse::Vec4f y( extent[1] * modelView[1] );
          dp::math::sse::Vec4f z( extent[2] * modelView[2] );

          vectors[1] = vectors[0] + x;
          vectors[2] = vectors[0] + y;
          vectors[3] = vectors[1] + y;
          vectors[4] = vectors[0] + z;
          vectors[5] = vectors[1] + z;
          vectors[6] = vectors[2] + z;
          vectors[7] = vectors[3] + z;

          for ( unsigned int i = 0;i < 8; ++i )
          {
            minValue = _mm_min_ps( minValue, vectors[i].sse() );
            maxValue = _mm_max_ps( maxValue, vectors[i].sse() );
          }
        }

        dp::math::Vec3f minVec, maxVec;
        _MM_EXTRACT_FLOAT( minVec[0], minValue, 0);
        _MM_EXTRACT_FLOAT( minVec[1], minValue, 1);
        _MM_EXTRACT_FLOAT( minVec[2], minValue, 2);

        _MM_EXTRACT_FLOAT( maxVec[0], maxValue, 0);
        _MM_EXTRACT_FLOAT( maxVec[1], maxValue, 1);
        _MM_EXTRACT_FLOAT( maxVec[2], maxValue, 2);

        return dp::math::Box3f( minVec, maxVec );
      }
      else
#elif defined(NEON)
        if ( useNEON )
        {
          GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

          float32x4_t minValue = vdupq_n_f32( std::numeric_limits<float>::max() );
          float32x4_t maxValue = vdupq_n_f32( -std::numeric_limits<float>::max() );

          char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
          for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index )
          {
            const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
            dp::math::neon::Mat44f const& modelView = *reinterpret_cast<dp::math::neon::Mat44f const*>(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride());
            dp::math::Vec4f const& extent = objectImpl->getExtent();

            dp::math::neon::Vec4f vectors[8];
            vectors[0] = *reinterpret_cast<dp::math::neon::Vec4f const*>(&objectImpl->getLowerLeft()) * modelView;

            dp::math::neon::Vec4f x( extent[0] * modelView[0] );
            dp::math::neon::Vec4f y( extent[1] * modelView[1] );
            dp::math::neon::Vec4f z( extent[2] * modelView[2] );

            vectors[1] = vectors[0] + x;
            vectors[2] = vectors[0] + y;
            vectors[3] = vectors[1] + y;
            vectors[4] = vectors[0] + z;
            vectors[5] = vectors[1] + z;
            vectors[6] = vectors[2] + z;
            vectors[7] = vectors[3] + z;

            for ( unsigned int i = 0;i < 8; ++i )
            {
              minValue = vminq_f32( minValue, vectors[i].neon() );
              maxValue = vmaxq_f32( maxValue, vectors[i].neon() );
            }

          }

          dp::math::Vec3f minVec, maxVec;

          vst1q_lane_f32( &minVec[0], minValue, 0);
          vst1q_lane_f32( &minVec[1], minValue, 1);
          vst1q_lane_f32( &minVec[2], minValue, 2);

          vst1q_lane_f32( &maxVec[0], maxValue, 0);
          vst1q_lane_f32( &maxVec[1], maxValue, 1);
          vst1q_lane_f32( &maxVec[2], maxValue, 2);

          return dp::math::Box3f( minVec, maxVec );
        }
        else

#endif
      // CPU fallback
      {
        GroupBitSetSharedPtr groupImpl = std::static_pointer_cast<GroupBitSet>(group);

        dp::math::Box4f boundingBox;

        char const* basePtr = reinterpret_cast<char const*>(groupImpl->getMatrices());
        for ( size_t index = 0;index < groupImpl->getObjectCount(); ++index )
        {
          const ObjectBitSetSharedPtr objectImpl = std::static_pointer_cast<ObjectBitSet>(groupImpl->getObject( index ));
          dp::math::Mat44f const& modelView = reinterpret_cast<dp::math::Mat44f const&>(*(basePtr + objectImpl->getTransformIndex() * groupImpl->getMatricesStride()));
          dp::math::Vec4f const& extent = objectImpl->getExtent();

          dp::math::Vec4f vectors[8];
          vectors[0] = (objectImpl->getLowerLeft() * modelView);

          dp::math::Vec4f x( extent[0] * modelView.getPtr()[0], extent[0] * modelView.getPtr()[1], extent[0] * modelView.getPtr()[2], extent[0] * modelView.getPtr()[3] );
          dp::math::Vec4f y( extent[1] * modelView.getPtr()[4], extent[1] * modelView.getPtr()[5], extent[1] * modelView.getPtr()[6], extent[1] * modelView.getPtr()[7] );
          dp::math::Vec4f z( extent[2] * modelView.getPtr()[8], extent[2] * modelView.getPtr()[9], extent[2] * modelView.getPtr()[10], extent[2] * modelView.getPtr()[11] );

          vectors[1] = vectors[0] + x;
          vectors[2] = vectors[0] + y;
          vectors[3] = vectors[1] + y;
          vectors[4] = vectors[0] + z;
          vectors[5] = vectors[1] + z;
          vectors[6] = vectors[2] + z;
          vectors[7] = vectors[3] + z;

          for ( unsigned int i = 0;i < 8; ++i )
          {
            boundingBox.update( vectors[i] );
          }
        }

        dp::math::Vec4f lower = boundingBox.getLower();
        dp::math::Vec4f upper = boundingBox.getUpper();

        return dp::math::Box3f( dp::math::Vec3f( lower[0], lower[1], lower[2]), dp::math::Vec3f( upper[0], upper[1], upper[2]));
      }
    }

Exemplo n.º 5

0

Exibir arquivo

Arquivo: dspXMM.cpp Projeto: songo/muse

void
x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max)
{
    __m128 current_max, current_min, work;

    // Load max and min values into all four slots of the XMM registers
    current_min = _mm_set1_ps(*min);
    current_max = _mm_set1_ps(*max);

    // Work input until "buf" reaches 16 byte alignment
    while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {

        // Load the next float into the work buffer
        work = _mm_set1_ps(*buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf++;
        nframes--;
    }

    // use 64 byte prefetch for quadruple quads
    while (nframes >= 16) {
        __builtin_prefetch(buf+64,0,0);

        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        nframes-=16;
    }

    // work through aligned buffers
    while (nframes >= 4) {

        work = _mm_load_ps(buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf+=4;
        nframes-=4;
    }

    // work through the rest < 4 samples
    while ( nframes > 0) {

        // Load the next float into the work buffer
        work = _mm_set1_ps(*buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf++;
        nframes--;
    }

    // Find min & max value in current_max through shuffle tricks

    work = current_min;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
    work = _mm_min_ps (work, current_min);
    current_min = work;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
    work = _mm_min_ps (work, current_min);

    _mm_store_ss(min, work);

    work = current_max;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
    work = _mm_max_ps (work, current_max);
    current_max = work;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
    work = _mm_max_ps (work, current_max);

    _mm_store_ss(max, work);
}

Exemplo n.º 6

0

Exibir arquivo

Arquivo: lightdesc.cpp Projeto: paralin/hl2sdk

void LightDesc_t::ComputeLightAtPoints( const FourVectors &pos, const FourVectors &normal,
										FourVectors &color, bool DoHalfLambert ) const
{
	FourVectors delta;
	
	Assert((m_Type==MATERIAL_LIGHT_POINT) || (m_Type==MATERIAL_LIGHT_SPOT) || (m_Type==MATERIAL_LIGHT_DIRECTIONAL));
	switch (m_Type)
	{
		case MATERIAL_LIGHT_POINT:
		case MATERIAL_LIGHT_SPOT:
			delta.DuplicateVector(m_Position);
			delta-=pos;
			break;
				
		case MATERIAL_LIGHT_DIRECTIONAL:
			delta.DuplicateVector(m_Direction);
			delta*=-1.0;
			break;
			
		default:
			delta.x = Four_Zeros;
			delta.y = Four_Zeros;
			delta.z = Four_Zeros;
			break;
	}

	__m128 dist2 = delta*delta;

	__m128 falloff;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 )
	{
		falloff = MMReplicate(m_Attenuation0);
	}
	else
		falloff= Four_Epsilons;

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 )
	{
		falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation1),_mm_sqrt_ps(dist2)));
	}

	if( m_Flags & LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 )
	{
		falloff=_mm_add_ps(falloff,_mm_mul_ps(MMReplicate(m_Attenuation2),dist2));
	}

	falloff=_mm_rcp_ps(falloff);
	// Cull out light beyond this radius
	// now, zero out elements for which dist2 was > range^2. !!speed!! lights should store dist^2 in sse format
	if (m_Range != 0.f)
	{
		__m128 RangeSquared=MMReplicate(m_RangeSquared); // !!speed!!
		falloff=_mm_and_ps(falloff,_mm_cmplt_ps(dist2,RangeSquared));
	}

	delta.VectorNormalizeFast();
	__m128 strength=delta*normal;
	if (DoHalfLambert)
	{
		strength=_mm_add_ps(_mm_mul_ps(strength,Four_PointFives),Four_PointFives);
	}
	else
		strength=_mm_max_ps(Four_Zeros,delta*normal);
		
	switch(m_Type)
	{
		case MATERIAL_LIGHT_POINT:
			// half-lambert
			break;
				
		case MATERIAL_LIGHT_SPOT:
		{
			__m128 dot2=_mm_sub_ps(Four_Zeros,delta*m_Direction); // dot position with spot light dir for cone falloff


			__m128 cone_falloff_scale=_mm_mul_ps(MMReplicate(OneOver_ThetaDot_Minus_PhiDot),
												 _mm_sub_ps(dot2,MMReplicate(m_PhiDot)));
			cone_falloff_scale=_mm_min_ps(cone_falloff_scale,Four_Ones);
			
			if ((m_Falloff!=0.0) && (m_Falloff!=1.0))
			{
				// !!speed!! could compute integer exponent needed by powsse and store in light
				cone_falloff_scale=PowSSE(cone_falloff_scale,m_Falloff);
			}
			strength=_mm_mul_ps(cone_falloff_scale,strength);

			// now, zero out lighting where dot2<phidot. This will mask out any invalid results
			// from pow function, etc
			__m128 OutsideMask=_mm_cmpgt_ps(dot2,MMReplicate(m_PhiDot)); // outside light cone?
			strength=_mm_and_ps(OutsideMask,strength);
		}
		break;
			
		case MATERIAL_LIGHT_DIRECTIONAL:
			break;
			
		default:
			break;

	}
	strength=_mm_mul_ps(strength,falloff);
	color.x=_mm_add_ps(color.x,_mm_mul_ps(strength,MMReplicate(m_Color.x)));
	color.y=_mm_add_ps(color.y,_mm_mul_ps(strength,MMReplicate(m_Color.y)));
	color.z=_mm_add_ps(color.z,_mm_mul_ps(strength,MMReplicate(m_Color.z)));
}

Exemplo n.º 7

0

Exibir arquivo

Arquivo: sse2-int16.c Projeto: Distrotech/babl

static long
conv_rgba16_rgbAF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;
  long remainder;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      long           n  = (samples / 2) * 2;
      const __m128i *s  = (const __m128i*) src;
            __v4sf  *d  = (__v4sf*) dst;

      const __v4sf  max_mask = { 0.0f, 0.0f, 0.0f, 1.0f };

      for (; i < n / 2; i++)
        {
          /* Expand shorts to ints by loading zero in the high bits */
          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          /* Convert to float */
          const __m128  u0 = _mm_cvtepi32_ps (t0);
          const __m128  u1 = _mm_cvtepi32_ps (t1);

          /* Multiply by 1 / 65535 */
          __v4sf rgba0 = u0 * u16_float;
          __v4sf rgba1 = u1 * u16_float;
          
          /* Expand alpha */
          __v4sf aaaa0 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba0, _MM_SHUFFLE(3, 3, 3, 3));
          __v4sf aaaa1 = (__v4sf)_mm_shuffle_epi32((__m128i)rgba1, _MM_SHUFFLE(3, 3, 3, 3));
          
          /* Set the value in the alpha slot to 1.0, we know max is sufficent because alpha was a short */
          aaaa0 = _mm_max_ps(aaaa0, max_mask);
          aaaa1 = _mm_max_ps(aaaa1, max_mask);
          
          /* Premultiply */
          rgba0 = rgba0 * aaaa0;
          rgba1 = rgba1 * aaaa1;
          
          d[2 * i + 0] = rgba0;
          d[2 * i + 1] = rgba1;
        }
      _mm_empty();
    }

  dst += i * 2 * 4;
  src += i * 2 * 4;
  remainder = samples - (i * 2);
  while (remainder--)
  {
    const float a = src[3] / 65535.0f;
    const float a_term = a / 65535.0f;
    dst[0] = src[0] * a_term;
    dst[1] = src[1] * a_term;
    dst[2] = src[2] * a_term;
    dst[3] = a;
    
    src += 4;
    dst += 4;
  }

  return samples;
}

Exemplo n.º 8

0

Exibir arquivo

Arquivo: colorspace_transform_avx.c Projeto: dmuktro/rawstudio

void
transform8_otherrgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}
	
	int start_x = t->start_x;
	/* Always have aligned input and output adress */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;
	
	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = ((complete_w+3) / 4 * 4);
	}
	__m128 gamma = _mm_set1_ps(t->output_gamma);

	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);
			__m128 p2f  = _mm_cvtepi32_ps(p2);
			__m128 p3f  = _mm_cvtepi32_ps(p3);
			__m128 p4f  = _mm_cvtepi32_ps(p4);
			
			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			g = _mm_mul_ps(upscale, _mm_fastpow_ps(g, gamma));
			b = _mm_mul_ps(upscale, _mm_fastpow_ps(b, gamma));

			/* Convert to 8 bit unsigned  and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);
			
			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
	
			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}
		/* Process remaining pixels */
		width = complete_w & 3;
		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load two pixels
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// GG RR GG RR
			r = _mm_movelh_ps(r, b2);		// BB BB GG RR

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_fastpow_ps(r, gamma));
			
			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}

Exemplo n.º 9

0

Exibir arquivo

Arquivo: juce_FloatVectorOperations.cpp Projeto: orbitfold/Obxd

 static forcedinline ParallelType max (ParallelType a, ParallelType b) noexcept  { return _mm_max_ps (a, b); }

Exemplo n.º 10

0

Exibir arquivo

Arquivo: OBSCapture.cpp Projeto: ascendedguard/OBS

void MixAudio(float *bufferDest, float *bufferSrc, UINT totalFloats, bool bForceMono)
{
    UINT floatsLeft = totalFloats;
    float *destTemp = bufferDest;
    float *srcTemp  = bufferSrc;

    if((UPARAM(destTemp) & 0xF) == 0 && (UPARAM(srcTemp) & 0xF) == 0)
    {
        UINT alignedFloats = floatsLeft & 0xFFFFFFFC;

        if(bForceMono)
        {
            __m128 halfVal = _mm_set_ps1(0.5f);
            for(UINT i=0; i<alignedFloats; i += 4)
            {
                float *micInput = srcTemp+i;
                __m128 val = _mm_load_ps(micInput);
                __m128 shufVal = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));

                _mm_store_ps(micInput, _mm_mul_ps(_mm_add_ps(val, shufVal), halfVal));
            }
        }

        __m128 maxVal = _mm_set_ps1(1.0f);
        __m128 minVal = _mm_set_ps1(-1.0f);

        for(UINT i=0; i<alignedFloats; i += 4)
        {
            float *pos = destTemp+i;

            __m128 mix;
            mix = _mm_add_ps(_mm_load_ps(pos), _mm_load_ps(srcTemp+i));
            mix = _mm_min_ps(mix, maxVal);
            mix = _mm_max_ps(mix, minVal);

            _mm_store_ps(pos, mix);
        }

        floatsLeft  &= 0x3;
        destTemp    += alignedFloats;
        srcTemp     += alignedFloats;
    }

    if(floatsLeft)
    {
        if(bForceMono)
        {
            for(UINT i=0; i<floatsLeft; i += 2)
            {
                srcTemp[i] += srcTemp[i+1];
                srcTemp[i] *= 0.5f;
                srcTemp[i+1] = srcTemp[i];
            }
        }

        for(UINT i=0; i<floatsLeft; i++)
        {
            float val = destTemp[i]+srcTemp[i];

            if(val < -1.0f)     val = -1.0f;
            else if(val > 1.0f) val = 1.0f;

            destTemp[i] = val;
        }
    }
}

Exemplo n.º 11

0

Exibir arquivo

Arquivo: tedge_sse2.c Projeto: chikuzen/GenericFilters

static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t* p0 = (uint16_t *)buff + 8;
    uint16_t* p1 = p0 + bstride;
    uint16_t* p2 = p1 + bstride;
    uint16_t* p3 = p2 + bstride;
    uint16_t* p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint16_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 4; i++) {
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));

                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }

            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax);
            }
            out[0] = mm_cast_epi32(out[0], out[1]);

            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);

            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);

            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}

Exemplo n.º 12

0

Exibir arquivo

Arquivo: invert.c Projeto: olyoberdorf/darktable

static gboolean draw(GtkWidget *widget, cairo_t *cr, dt_iop_module_t *self)
{
  if(darktable.gui->reset) return FALSE;
  if(self->picked_color_max[0] < 0.0f) return FALSE;
  if(self->request_color_pick == DT_REQUEST_COLORPICK_OFF) return FALSE;
  dt_iop_invert_gui_data_t *g = (dt_iop_invert_gui_data_t *)self->gui_data;
  dt_iop_invert_params_t *p = (dt_iop_invert_params_t *)self->params;

  if(fabsf(p->color[0] - self->picked_color[0]) < 0.0001f
     && fabsf(p->color[1] - self->picked_color[1]) < 0.0001f
     && fabsf(p->color[2] - self->picked_color[2]) < 0.0001f)
  {
    // interrupt infinite loops
    return FALSE;
  }

  p->color[0] = self->picked_color[0];
  p->color[1] = self->picked_color[1];
  p->color[2] = self->picked_color[2];
  GdkRGBA color = (GdkRGBA){.red = p->color[0], .green = p->color[1], .blue = p->color[2], .alpha = 1.0 };
  gtk_color_chooser_set_rgba(GTK_COLOR_CHOOSER(g->colorpicker), &color);

  dt_dev_add_history_item(darktable.develop, self, TRUE);
  return FALSE;
}

static void colorpicker_callback(GtkColorButton *widget, dt_iop_module_t *self)
{
  if(self->dt->gui->reset) return;
  dt_iop_invert_gui_data_t *g = (dt_iop_invert_gui_data_t *)self->gui_data;
  dt_iop_invert_params_t *p = (dt_iop_invert_params_t *)self->params;

  // turn off the other color picker so that this tool actually works ...
  gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(g->picker), FALSE);

  GdkRGBA c;
  gtk_color_chooser_get_rgba(GTK_COLOR_CHOOSER(widget), &c);
  p->color[0] = c.red;
  p->color[1] = c.green;
  p->color[2] = c.blue;

  dt_dev_add_history_item(darktable.develop, self, TRUE);
}

void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_invert_data_t *d = (dt_iop_invert_data_t *)piece->data;

  const float *const m = piece->pipe->processed_maximum;

  float film_rgb[4] = { d->color[0], d->color[1], d->color[2], 0.0f };

  // Convert the RGB color to CYGM only if we're not in the preview pipe (which is already RGB)
  if((self->dev->image_storage.flags & DT_IMAGE_4BAYER) && !dt_dev_pixelpipe_uses_downsampled_input(piece->pipe))
    dt_colorspaces_rgb_to_cygm(film_rgb, 1, d->RGB_to_CAM);

  const float film_rgb_f[4] = { film_rgb[0] * m[0], film_rgb[1] * m[1], film_rgb[2] * m[2], film_rgb[3] * m[3] };

  // FIXME: it could be wise to make this a NOP when picking colors. not sure about that though.
  //   if(self->request_color_pick){
  // do nothing
  //   }

  const int filters = dt_image_filter(&piece->pipe->image);
  const uint8_t (*const xtrans)[6] = (const uint8_t (*const)[6]) self->dev->image_storage.xtrans;

  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && (filters == 9u))
  { // xtrans float mosaiced
#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + (size_t)j * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)j * roi_out->width;
      for(int i = 0; i < roi_out->width; i++, out++, in++)
        *out = CLAMP(film_rgb_f[FCxtrans(j, i, roi_out, xtrans)] - *in, 0.0f, 1.0f);
    }

    for(int k = 0; k < 4; k++) piece->pipe->processed_maximum[k] = 1.0f;
  }
  else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters)
  { // bayer float mosaiced

    const __m128 val_min = _mm_setzero_ps();
    const __m128 val_max = _mm_set1_ps(1.0f);

#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + (size_t)j * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)j * roi_out->width;

      int i = 0;
      int alignment = ((4 - (j * roi_out->width & (4 - 1))) & (4 - 1));

      // process unaligned pixels
      for(; i < alignment; i++, out++, in++)
        *out = CLAMP(film_rgb_f[FC(j + roi_out->y, i + roi_out->x, filters)] - *in, 0.0f, 1.0f);

      const __m128 film = _mm_set_ps(film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 3, filters)],
                                     film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 2, filters)],
                                     film_rgb_f[FC(j + roi_out->y, roi_out->x + i + 1, filters)],
                                     film_rgb_f[FC(j + roi_out->y, roi_out->x + i, filters)]);

      // process aligned pixels with SSE
      for(; i < roi_out->width - (4 - 1); i += 4, in += 4, out += 4)
      {
        const __m128 input = _mm_load_ps(in);
        const __m128 subtracted = _mm_sub_ps(film, input);
        _mm_stream_ps(out, _mm_max_ps(_mm_min_ps(subtracted, val_max), val_min));
      }

      // process the rest
      for(; i < roi_out->width; i++, out++, in++)
        *out = CLAMP(film_rgb_f[FC(j + roi_out->y, i + roi_out->x, filters)] - *in, 0.0f, 1.0f);
    }
    _mm_sfence();

    for(int k = 0; k < 4; k++) piece->pipe->processed_maximum[k] = 1.0f;
  }
  else
  { // non-mosaiced
    const int ch = piece->colors;

    const __m128 film = _mm_set_ps(1.0f, film_rgb[2], film_rgb[1], film_rgb[0]);

#ifdef _OPENMP
#pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid) schedule(static)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;
      for(int j = 0; j < roi_out->width; j++, in += ch, out += ch)
      {
        const __m128 input = _mm_load_ps(in);
        const __m128 subtracted = _mm_sub_ps(film, input);
        _mm_stream_ps(out, subtracted);
      }
    }
    _mm_sfence();

    if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
  }
}

Exemplo n.º 13

0

Exibir arquivo

Arquivo: sse-maxps-1.c Projeto: MaxKellermann/gcc

test (__m128 s1, __m128 s2)
{
  return _mm_max_ps (s1, s2); 
}

Exemplo n.º 14

0

Exibir arquivo

Arquivo: Occluder.cpp Projeto: xyzr0482/rasterizer

std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}

Exemplo n.º 15

0

Exibir arquivo

Arquivo: aec_core_sse2.c Projeto: AGProjects/python-sipsimple

static __m128 mm_pow_ps(__m128 a, __m128 b)
{
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  __m128 log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
    const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
        shift_exponent_into_top_mantissa);
    const __m128 n_0 = _mm_or_ps(
        (__m128)n_1, *((__m128 *)eight_biased_exponent));
    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));

    // Compute y.
    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
    const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
    const __m128 y        = _mm_or_ps(
        mantissa,  *((__m128 *)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    static const ALIGN16_BEG float ALIGN16_END C5[4] =
        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
    static const ALIGN16_BEG float ALIGN16_END C4[4] =
        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
    static const ALIGN16_BEG float ALIGN16_END C3[4] =
        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
    static const ALIGN16_BEG float ALIGN16_END C2[4] =
        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
    static const ALIGN16_BEG float ALIGN16_END C1[4] =
        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
    static const ALIGN16_BEG float ALIGN16_END C0[4] =
        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
    const __m128 pol5_y_0 = _mm_mul_ps(y,        *((__m128 *)C5));
    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
    const __m128 pol5_y   = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
    const __m128 y_minus_one = _mm_sub_ps(
        y, *((__m128 *)zero_biased_exponent_is_one));
    const __m128 log2_y = _mm_mul_ps(y_minus_one ,  pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = _mm_mul_ps(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
        {129.f, 129.f, 129.f, 129.f};
    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
    const __m128 x_max = _mm_max_ps(x_min,    *((__m128 *)min_input));
    // Compute n.
    static const ALIGN16_BEG float half[4] ALIGN16_END =
        {0.5f, 0.5f, 0.5f, 0.5f};
    const __m128  x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n.
    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
        {127, 127, 127, 127};
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent = _mm_add_epi32(
        x_minus_half_floor, *((__m128i *)float_exponent_bias));
    const __m128  two_n = (__m128)_mm_slli_epi32(
        two_n_exponent, float_exponent_shift);
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    static const ALIGN16_BEG float C2[4] ALIGN16_END =
        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END =
        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END =
        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
    const __m128 exp2_y_0 = _mm_mul_ps(y,        *((__m128 *)C2));
    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
    const __m128 exp2_y   = _mm_add_ps(exp2_y_2, *((__m128 *)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
  }
  return a_exp_b;
}

Exemplo n.º 16

0

Exibir arquivo

Arquivo: AABB.cpp Projeto: chengzg/MathGeoLib

bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const
{
	assume(rayDir.IsNormalized4());
	assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!");
	/* For reference, this is the C++ form of the vectorized SSE code below.

	float4 recipDir = rayDir.RecipFast4();
	float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir);
	float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir);
	float4 near = t1.Min(t2);
	float4 far = t1.Max(t2);
	float4 rayDirAbs = rayDir.Abs();

	if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question
	{
		tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box.
		return false;

	return tNear < tFar;
	*/

	__m128 recipDir = _mm_rcp_ps(rayDir.v);
	// Note: The above performs an approximate reciprocal (11 bits of precision).
	// For a full precision reciprocal, perform a div:
//	__m128 recipDir = _mm_div_ps(_mm_set1_ps(1.f), rayDir.v);

	__m128 t1 = _mm_mul_ps(_mm_sub_ps(MinPoint_SSE(), rayPos.v), recipDir);
	__m128 t2 = _mm_mul_ps(_mm_sub_ps(MaxPoint_SSE(), rayPos.v), recipDir);

	__m128 nearD = _mm_min_ps(t1, t2); // [0 n3 n2 n1]
	__m128 farD = _mm_max_ps(t1, t2);  // [0 f3 f2 f1]

	// Check if the ray direction is parallel to any of the cardinal axes, and if so,
	// mask those [near, far] ranges away from the hit test computations.
	__m128 rayDirAbs = abs_ps(rayDir.v);

	const __m128 epsilon = _mm_set1_ps(1e-4f);
	// zeroDirections[i] will be nonzero for each axis i the ray is parallel to.
	__m128 zeroDirections = _mm_cmple_ps(rayDirAbs, epsilon);

	const __m128 floatInf = _mm_set1_ps(FLOAT_INF);
	const __m128 floatNegInf = _mm_set1_ps(-FLOAT_INF);

	// If the ray is parallel to one of the axes, replace the slab range for that axis
	// with [-inf, inf] range instead. (which is a no-op in the comparisons below)
	nearD = cmov_ps(nearD, floatNegInf, zeroDirections);
	farD = cmov_ps(farD , floatInf, zeroDirections);

	// Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2])
	// to see if there is an overlap in the hit ranges.
	__m128 v1 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(0, 0, 0, 0)); // [f1 f1 n1 n1]
	__m128 v2 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(1, 1, 1, 1)); // [f2 f2 n2 n2]
	__m128 v3 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(2, 2, 2, 2)); // [f3 f3 n3 n3]
	nearD = _mm_max_ps(v1, _mm_max_ps(v2, v3));
	farD = _mm_min_ps(v1, _mm_min_ps(v2, v3));
	farD = _mm_shuffle_ps(farD, farD, _MM_SHUFFLE(3, 3, 3, 3)); // Unpack the result from high offset in the register.
	nearD = _mm_max_ps(nearD, _mm_set_ss(tNear));
	farD = _mm_min_ps(farD, _mm_set_ss(tFar));

	// Finally, test if the ranges overlap.
	__m128 rangeIntersects = _mm_cmple_ss(nearD, farD);

	// To store out out the interval of intersection, uncomment the following:
	// These are disabled, since without these, the whole function runs without a single memory store,
	// which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown.
	// For now, using the SSE version only where the tNear and tFar ranges are not interesting.
//	_mm_store_ss(&tNear, nearD);
//	_mm_store_ss(&tFar, farD);

	// To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction
	// is parallel to.
	__m128 out2 = _mm_cmplt_ps(rayPos.v, MinPoint_SSE());
	__m128 out3 = _mm_cmpgt_ps(rayPos.v, MaxPoint_SSE());
	out2 = _mm_or_ps(out2, out3);
	zeroDirections = _mm_and_ps(zeroDirections, out2);

	__m128 yOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(1,1,1,1));
	__m128 zOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(2,2,2,2));

	zeroDirections = _mm_or_ps(_mm_or_ps(zeroDirections, yOut), zOut);
	// Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being
	// parallel to some cardinal axis.
	__m128 intersects = _mm_andnot_ps(zeroDirections, rangeIntersects);
	__m128 epsilonMasked = _mm_and_ps(epsilon, intersects);
	return _mm_comieq_ss(epsilon, epsilonMasked) != 0;
}

Exemplo n.º 17

0

Exibir arquivo

Arquivo: graduatednd.c Projeto: dtorop/darktable

void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_graduatednd_data_t *const data = (const dt_iop_graduatednd_data_t *const)piece->data;
  const int ch = piece->colors;

  const int ix = (roi_in->x);
  const int iy = (roi_in->y);
  const float iw = piece->buf_in.width * roi_out->scale;
  const float ih = piece->buf_in.height * roi_out->scale;
  const float hw = iw / 2.0;
  const float hh = ih / 2.0;
  const float hw_inv = 1.0 / hw;
  const float hh_inv = 1.0 / hh;
  const float v = (-data->rotation / 180) * M_PI;
  const float sinv = sin(v);
  const float cosv = cos(v);
  const float filter_radie = sqrt((hh * hh) + (hw * hw)) / hh;
  const float offset = data->offset / 100.0 * 2;

#if 1
  const float filter_compression = 1.0 / filter_radie
                                   / (1.0 - (0.5 + (data->compression / 100.0) * 0.9 / 2.0)) * 0.5;
#else
  const float compression = data->compression / 100.0f;
  const float t = 1.0f - .8f / (.8f + compression);
  const float c = 1.0f + 1000.0f * powf(4.0, compression);
#endif


  if(data->density > 0)
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      size_t k = (size_t)roi_out->width * y * ch;
      const float *in = (float *)ivoid + k;
      float *out = (float *)ovoid + k;

      float length = (sinv * (-1.0 + ix * hw_inv) - cosv * (-1.0 + (iy + y) * hh_inv) - 1.0 + offset)
                     * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0, data->color[2], data->color[1], data->color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f), c);

      for(int x = 0; x < roi_out->width; x++, in += ch, out += ch)
      {
#if 1
        // !!! approximation is ok only when highest density is 8
        // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (data->density * CLIP(0.5f + length) / 8.0f);
        float d1 = t * t * 0.5f;
        float d2 = d1 * t * 0.333333333f;
        float d3 = d2 * t * 0.25f;
        float d = 1 + t + d1 + d2 + d3; /* taylor series for e^x till x^4 */
        // printf("%d %d  %f\n",y,x,d);
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
#else
        // use fair exp2f
        __m128 density = _mm_set1_ps(exp2f(data->density * CLIP(0.5f + length)));
#endif

        /* max(0,in / (c + (1-c)*density)) */
        _mm_stream_ps(out, _mm_max_ps(_mm_set1_ps(0.0f),
                                      _mm_div_ps(_mm_load_ps(in), _mm_add_ps(c, _mm_mul_ps(c1, density)))));

        length += length_inc;
      }
    }
  }
  else
  {
#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int y = 0; y < roi_out->height; y++)
    {
      size_t k = (size_t)roi_out->width * y * ch;
      const float *in = (float *)ivoid + k;
      float *out = (float *)ovoid + k;

      float length = (sinv * (-1.0f + ix * hw_inv) - cosv * (-1.0f + (iy + y) * hh_inv) - 1.0f + offset)
                     * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0, data->color[2], data->color[1], data->color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f), c);

      for(int x = 0; x < roi_out->width; x++, in += ch, out += ch)
      {
#if 1
        // !!! approximation is ok only when lowest density is -8
        // for input x = (-data->density * CLIP( 0.5-length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==-8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (-data->density * CLIP(0.5f - length) / 8.0f);
        float d1 = t * t * 0.5f;
        float d2 = d1 * t * 0.333333333f;
        float d3 = d2 * t * 0.25f;
        float d = 1 + t + d1 + d2 + d3; /* taylor series for e^x till x^4 */
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
        density = _mm_mul_ps(density, density);
#else
        __m128 density = _mm_set1_ps(exp2f(-data->density * CLIP(0.5f - length)));
#endif

        /* max(0,in * (c + (1-c)*density)) */
        _mm_stream_ps(out, _mm_max_ps(_mm_set1_ps(0.0f),
                                      _mm_mul_ps(_mm_load_ps(in), _mm_add_ps(c, _mm_mul_ps(c1, density)))));

        length += length_inc;
      }
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}

Exemplo n.º 18

0

Exibir arquivo

Arquivo: aec_core_sse2.c Projeto: Crawping/chromium_extract

// Updates the following smoothed  Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, also the filter diverge state is determined
// upon actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const __m128 vec_15 =  _mm_set1_ps(WebRtcAec_kMinFarendPSD);
  const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
  const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
  __m128 vec_sdSum = _mm_set1_ps(0.0f);
  __m128 vec_seSum = _mm_set1_ps(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
    const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
    const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
    const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
    const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
    const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
    __m128 vec_sd = _mm_mul_ps(_mm_loadu_ps(&aec->sd[i]), vec_GCoh0);
    __m128 vec_se = _mm_mul_ps(_mm_loadu_ps(&aec->se[i]), vec_GCoh0);
    __m128 vec_sx = _mm_mul_ps(_mm_loadu_ps(&aec->sx[i]), vec_GCoh0);
    __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
    __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
    __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
    vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
    vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
    vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
    vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
    vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
    vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
    vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
    _mm_storeu_ps(&aec->sd[i], vec_sd);
    _mm_storeu_ps(&aec->se[i], vec_se);
    _mm_storeu_ps(&aec->sx[i], vec_sx);

    {
      const __m128 vec_3210 = _mm_loadu_ps(&aec->sde[i][0]);
      const __m128 vec_7654 = _mm_loadu_ps(&aec->sde[i + 2][0]);
      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(2, 0, 2, 0));
      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
      __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
      vec_dfwefw0011 = _mm_add_ps(vec_dfwefw0011,
                                  _mm_mul_ps(vec_dfw1, vec_efw1));
      vec_dfwefw0110 = _mm_sub_ps(vec_dfwefw0110,
                                  _mm_mul_ps(vec_dfw1, vec_efw0));
      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
      _mm_storeu_ps(&aec->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
      _mm_storeu_ps(&aec->sde[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
    }

    {
      const __m128 vec_3210 = _mm_loadu_ps(&aec->sxd[i][0]);
      const __m128 vec_7654 = _mm_loadu_ps(&aec->sxd[i + 2][0]);
      __m128 vec_a = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(2, 0, 2, 0));
      __m128 vec_b = _mm_shuffle_ps(vec_3210, vec_7654,
                                    _MM_SHUFFLE(3, 1, 3, 1));
      __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
      __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
      vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
      vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
      vec_dfwxfw0011 = _mm_add_ps(vec_dfwxfw0011,
                                  _mm_mul_ps(vec_dfw1, vec_xfw1));
      vec_dfwxfw0110 = _mm_sub_ps(vec_dfwxfw0110,
                                  _mm_mul_ps(vec_dfw1, vec_xfw0));
      vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
      vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
      _mm_storeu_ps(&aec->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
      _mm_storeu_ps(&aec->sxd[i + 2][0], _mm_unpackhi_ps(vec_a, vec_b));
    }

    vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
    vec_seSum = _mm_add_ps(vec_seSum, vec_se);
  }

  _mm_add_ps_4x1(vec_sdSum, &sdSum);
  _mm_add_ps_4x1(vec_seSum, &seSum);

  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
            WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}

Exemplo n.º 19

0

Exibir arquivo

Arquivo: sse.c Projeto: CharoL/bioinfo-libs

void sse_matrix(int num_seqs, 
		char **q, int *q_len, int max_q_len,
		char **r, int *r_len, int max_r_len,
		float profile[128][128], float gap_open, float gap_extend,
		float *H, float *F, int *C, float *max_score) {
  
  const int depth = 4;

  __m128 h_simd, e_simd, f_simd, diagonal_simd;
  __m128 temp_simd, subst_simd;

  __m128i zeroi   = _mm_set_epi32(0, 0, 0, 0);

  __m128 score_simd = _mm_setzero_ps();
  __m128 zero_simd = _mm_setzero_ps();
  __m128 one_simd = _mm_set1_ps(1);
  __m128 gap_open_simd = _mm_set1_ps(gap_open);
  __m128 gap_extend_simd = _mm_set1_ps(gap_extend);

  __m128 max_de, max_fz;
  __m128 cmp_de, cmp_fz, cmp_de_fz;
  __m128i c;

  int offset, index, idx, j_depth;
  int q_len_depth = depth * max_q_len;
  /*
      for (int i = 0; i < 4; i++) {
        printf("query %i:%s\nref.  %i:%s\n\n", i, q[i], i, r[i]);
      }
  */
  h_simd = zero_simd;
  e_simd = zero_simd;
  
  for (int j = 0; j < max_q_len; j++) {

    j_depth = depth * j;
    
    // left value: gap in reference
    e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd), 
			_mm_sub_ps(h_simd, gap_open_simd));

    //    printf("from left: %0.2f\n", ((float *)&e_simd)[0]);
    
    // diagonal value: match or mismatch

    subst_simd = _mm_set_ps((q_len[3] > j) ? profile[q[3][j]][r[3][0]] : -1000.0f,
                            (q_len[2] > j) ? profile[q[2][j]][r[2][0]] : -1000.0f,
                            (q_len[1] > j) ? profile[q[1][j]][r[1][0]] : -1000.0f,
                            (q_len[0] > j) ? profile[q[0][j]][r[0][0]] : -1000.0f);
    /*
    subst_simd = _mm_set_ps(profile[q[3][j]][r[3][0]], 
			    profile[q[2][j]][r[2][0]], 
			    profile[q[1][j]][r[1][0]], 
			    profile[q[0][j]][r[0][0]]);
    */

    diagonal_simd = _mm_add_ps(zero_simd, subst_simd);
    //    printf("from diagonal: temp = %0.2f %0.2f (%c, %c) -> %0.2f\n", ((float *)&temp_simd)[0], profile[q[0][j]][r[0][0]], q[0][j], r[0][0], ((float *)&diagonal_simd)[0]);

    cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
    max_de = _mm_max_ps(diagonal_simd, e_simd);

    // up value: gap in query
    f_simd = _mm_max_ps(_mm_sub_ps(zero_simd, gap_extend_simd), 
			_mm_sub_ps(zero_simd, gap_open_simd));

    cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
    max_fz = _mm_max_ps(f_simd, zero_simd);
    
    //    printf("from up: %0.2f\n", ((float *)&f_simd)[0]);
    
    // get max. value and save it
    cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
    h_simd = _mm_max_ps(max_de, max_fz);

    score_simd = _mm_max_ps(score_simd, h_simd);
    //    printf("\t\t\t\t\tmax. score: %0.2f\n", ((float *)&h_simd)[0]);
    
    // compass (save left, diagonal, up or zero?)
    c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
    c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
    c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

    //    printf("\t\t\t\t\tcompass: %i\n", ((int *)&c)[0]);

    // update matrices
    _mm_store_ps(&H[j_depth], h_simd);
    _mm_store_ps(&F[j_depth], f_simd);
    _mm_store_si128((__m128i *)&C[j_depth], c);

 
    //_mm_store_ps(&D[j_depth], diagonal_simd);

    /*
    offset = j_depth;
    printf("(row, col) = (%i, %i):\t \t%c-%c=%0.2f %c-%c=%0.2f %c-%c=%0.2f %c-%c=%0.2f\n", 0, j, q[0][j], r[0][0], profile[q[0][j]][r[0][0]], q[1][j], r[1][0], profile[q[1][j]][r[1][0]], q[2][j], r[2][0], profile[q[2][j]][r[2][0]], q[3][j], r[3][0], profile[q[3][j]][r[3][0]]);
    printf("(row, col) = (%i, %i):\tH\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
    printf("(row, col) = (%i, %i):\tD\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, D[offset], D[offset+1], D[offset+2], D[offset+3]);
    printf("(row, col) = (%i, %i):\td\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, ((float *)&diagonal_simd)[0], ((float *)&diagonal_simd)[1], ((float *)&diagonal_simd)[2], ((float *)&diagonal_simd)[3]);

    printf("(row, col) = (%i, %i):\ts\t%0.2f %0.2f %0.2f %0.2f\n", 0, j, ((float *)&subst_simd)[0], ((float *)&subst_simd)[1], ((float *)&subst_simd)[2], ((float *)&subst_simd)[3]);
    */
  }
  //  printf("\n");

  //  exit(-1);
  int target = 0;
  for (int i = 1; i < max_r_len; i++) {
    
    h_simd = zero_simd;
    e_simd = zero_simd;
    temp_simd = zero_simd;

    idx = i * q_len_depth;

    for (int j = 0; j < max_q_len; j++) {
      j_depth = depth * j;
      offset = idx + j_depth;

      // left value: gap in reference
      e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd), 
			  _mm_sub_ps(h_simd, gap_open_simd));
 
      //      if (i == 3 && j == 3) printf("from left: %0.2f\n", ((float *)&e_simd)[target]);

      // diagonal value: match or mismatch
      diagonal_simd = _mm_add_ps(temp_simd,
				 _mm_set_ps((q_len[3] > j && r_len[3] > i) ? profile[q[3][j]][r[3][i]] : -1000.0f, 
					    (q_len[2] > j && r_len[2] > i) ? profile[q[2][j]][r[2][i]] : -1000.0f, 
					    (q_len[1] > j && r_len[1] > i) ? profile[q[1][j]][r[1][i]] : -1000.0f, 
					    (q_len[0] > j && r_len[0] > i) ? profile[q[0][j]][r[0][i]] : -1000.0f)
				 );

      cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
      max_de = _mm_max_ps(diagonal_simd, e_simd);

      //      if (i == 3 && j == 3)	printf("from diagonal: temp = %0.2f %0.2f (%c, %c) -> %0.2f\n", ((float *)&temp_simd)[target], profile[q[target][j]][r[target][i]], q[target][j], r[target][i], ((float *)&diagonal_simd)[target]);
      
      // up value: gap in query
      temp_simd = _mm_load_ps(&H[offset - q_len_depth]);

      f_simd = _mm_load_ps(&F[j_depth]);
      f_simd = _mm_max_ps(_mm_sub_ps(f_simd, gap_extend_simd), 
			  _mm_sub_ps(temp_simd, gap_open_simd));

      cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
      max_fz = _mm_max_ps(f_simd, zero_simd);

      //      if (i == 3 && j == 3) printf("from up: %0.2f\n", ((float *)&f_simd)[target]);

      // get max. value
      cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
      h_simd = _mm_max_ps(max_de, max_fz);

      score_simd = _mm_max_ps(score_simd, h_simd);

      //      if (i == 3 && j == 3) printf("\t\t\t\t\tmax. score: %0.2f\n", ((float *)&h_simd)[target]);

      // compass (save left, diagonal, up or zero?)
      c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
      c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
      c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

      // update matrices
      _mm_store_ps(&H[offset], h_simd);
      _mm_store_ps(&F[j_depth], f_simd); 
      _mm_store_si128((__m128i *)&C[offset], c);

      /*
      if (j==0) {
	printf("(row, col) = (%i, %i):\tD\t%0.2f %0.2f %0.2f %0.2f\n", i, j, D[offset], D[offset+1], D[offset+2], D[offset+3]);
	printf("(row, col) = (%i, %i):\tH\t%0.2f %0.2f %0.2f %0.2f\n", i, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
      }
      */
      //      printf("(row, col) = (%i, %i):\t%0.2f %0.2f %0.2f %0.2f\n", i, j, H[offset], H[offset+1], H[offset+2], H[offset+3]);
    }
    //    printf("\n");
  }
  _mm_store_ps(max_score, score_simd);
  
  /*

  int rr_len = r_len[0];
  int qq_len = q_len[0];
  printf("r_len[0] = %i, q_len[0] = %i\n", rr_len, qq_len);

  printf("sse\n");
  for (int i = 0; i < rr_len; i++) {
    printf("\t");
    for (int j = 0; j < qq_len; j++) {
      printf("%0.2f\t", H[(i * max_q_len * 4) + (j * 4)]);
    }
    printf("\n");
  }
  */
  /*
  char filename[200];
  for (int i = 0; i < 4; i++) {
    sprintf(filename, "/tmp/sse1-%i.score", i);
    save_float_matrix(H, max_q_len, max_r_len, q[i], q_len[i], r[i], r_len[i], i, 4, filename);
  }
  */
  /*
      for (int i = 0; i < 4; i++) {
        printf("score %i:%0.2f\n\n", i, max_score[i]);
      }
  */
}

Exemplo n.º 20

0

Exibir arquivo

Arquivo: convert_rgb.c Projeto: shootthemoonfilms/dcraw-fast

void convert_to_rgb_fast()
{
    unsigned i,j,c;
    int row, col, k;
    ushort *img;
    float out_cam[3][4];
    double num, inverse[3][3];
    static const double xyzd50_srgb[3][3] =
    { { 0.436083, 0.385083, 0.143055 },
      { 0.222507, 0.716888, 0.060608 },
      { 0.013930, 0.097097, 0.714022 } };
    static const double rgb_rgb[3][3] =
    { { 1,0,0 }, { 0,1,0 }, { 0,0,1 } };
    static const double adobe_rgb[3][3] =
    { { 0.715146, 0.284856, 0.000000 },
      { 0.000000, 1.000000, 0.000000 },
      { 0.000000, 0.041166, 0.958839 } };
    static const double wide_rgb[3][3] =
    { { 0.593087, 0.404710, 0.002206 },
      { 0.095413, 0.843149, 0.061439 },
      { 0.011621, 0.069091, 0.919288 } };
    static const double prophoto_rgb[3][3] =
    { { 0.529317, 0.330092, 0.140588 },
      { 0.098368, 0.873465, 0.028169 },
      { 0.016879, 0.117663, 0.865457 } };
    static const double (*out_rgb[])[3] =
    { rgb_rgb, adobe_rgb, wide_rgb, prophoto_rgb, xyz_rgb };
    static const char *name[] =
    { "sRGB", "Adobe RGB (1998)", "WideGamut D65", "ProPhoto D65", "XYZ" };
    static const unsigned phead[] =
    { 1024, 0, 0x2100000, 0x6d6e7472, 0x52474220, 0x58595a20, 0, 0, 0,
      0x61637370, 0, 0, 0x6e6f6e65, 0, 0, 0, 0, 0xf6d6, 0x10000, 0xd32d };
    unsigned pbody[] =
    { 10, 0x63707274, 0, 36, /* cprt */
      0x64657363, 0, 40, /* desc */
      0x77747074, 0, 20, /* wtpt */
      0x626b7074, 0, 20, /* bkpt */
      0x72545243, 0, 14, /* rTRC */
      0x67545243, 0, 14, /* gTRC */
      0x62545243, 0, 14, /* bTRC */
      0x7258595a, 0, 20, /* rXYZ */
      0x6758595a, 0, 20, /* gXYZ */
      0x6258595a, 0, 20 }; /* bXYZ */
    static const unsigned pwhite[] = { 0xf351, 0x10000, 0x116cc };
    unsigned pcurve[] = { 0x63757276, 0, 1, 0x1000000 };

    gamma_curve (gamm[0], gamm[1], 0, 0);
    memcpy (out_cam, rgb_cam, sizeof out_cam);
    raw_color |= colors == 1 || document_mode ||
                 output_color < 1 || output_color > 5;
    if (!raw_color) {
        oprof = (unsigned *) calloc (phead[0], 1);
        merror (oprof, "convert_to_rgb()");
        memcpy (oprof, phead, sizeof phead);
        if (output_color == 5) oprof[4] = oprof[5];
        oprof[0] = 132 + 12*pbody[0];
        for (i=0; i < pbody[0]; i++) {
            oprof[oprof[0]/4] = i ? (i > 1 ? 0x58595a20 : 0x64657363) : 0x74657874;
            pbody[i*3+2] = oprof[0];
            oprof[0] += (pbody[i*3+3] + 3) & -4;
        }
        memcpy (oprof+32, pbody, sizeof pbody);
        oprof[pbody[5]/4+2] = strlen(name[output_color-1]) + 1;
        memcpy ((char *)oprof+pbody[8]+8, pwhite, sizeof pwhite);
        pcurve[3] = (short)(256/gamm[5]+0.5) << 16;
        for (i=4; i < 7; i++)
            memcpy ((char *)oprof+pbody[i*3+2], pcurve, sizeof pcurve);
        pseudoinverse ((double (*)[3])out_rgb[output_color-1], inverse, 3);
        for (i=0; i < 3; i++)
            for (j=0; j < 3; j++) {
                for (num = k=0; k < 3; k++)
                    num += xyzd50_srgb[i][k] * inverse[j][k];
                oprof[pbody[j*3+23]/4+i+2] = num * 0x10000 + 0.5;
            }
        for (i=0; i < phead[0]/4; i++)
            oprof[i] = htonl(oprof[i]);
        strcpy ((char *)oprof+pbody[2]+8, "auto-generated by dcraw");
        strcpy ((char *)oprof+pbody[5]+12, name[output_color-1]);
        for (i=0; i < 3; i++)
            for (j=0; j < colors; j++)
                for (out_cam[i][j] = k=0; k < 3; k++)
                    out_cam[i][j] += out_rgb[output_color-1][i][k] * rgb_cam[k][j];
    }
    if (verbose)
        fprintf (stderr, raw_color ? _("Building histograms...\n") :
                 _("Converting to %s colorspace...\n"), name[output_color-1]);

    memset (histogram, 0, sizeof histogram);
    if(!raw_color) {
        __m128 outcam0= {out_cam[0][0],out_cam[1][0],out_cam[2][0],0},
               outcam1= {out_cam[0][1],out_cam[1][1],out_cam[2][1],0},
               outcam2= {out_cam[0][2],out_cam[1][2],out_cam[2][2],0},
               outcam3= {out_cam[0][3],out_cam[1][3],out_cam[2][3],0};
        for (img=image[0]; img < image[width*height]; img+=4) {
            __m128 out0;
            __m128 vimg0 = {img[0],img[0],img[0],0},
                   vimg1 = {img[1],img[1],img[1],0},
                   vimg2 = {img[2],img[2],img[2],0},
                   vimg3 = {img[3],img[3],img[3],0};

//             out[0] = out_cam[0][0] * img[0]
//                     +out_cam[0][1] * img[1]
//                     +out_cam[0][2] * img[2]
//                     +out_cam[0][3] * img[3];
//             out[1] = out_cam[1][0] * img[0]
//                     +out_cam[1][1] * img[1]
//                     +out_cam[1][2] * img[2]
//                     +out_cam[1][3] * img[3];
//             out[2] = out_cam[2][0] * img[0]
//                     +out_cam[2][1] * img[1]
//                     +out_cam[2][2] * img[2]
//                     +out_cam[2][3] * img[3];
            out0 = _mm_add_ps(_mm_add_ps(
                     _mm_mul_ps(vimg0, outcam0),
                     _mm_mul_ps(vimg1, outcam1)
                   ), _mm_add_ps(
                     _mm_mul_ps(vimg2, outcam2),
                     _mm_mul_ps(vimg3, outcam3)
                   ));
            //clip
            out0 = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(out0, _MM_FROUND_TO_ZERO)));
            __m128i o = _mm_cvtps_epi32(out0);
            o = _mm_packus_epi32(o,_mm_setzero_si128());
            memcpy(img, &o, sizeof(short)*3);

            FORCC histogram[c][img[c] >> 3]++;
        }

    } else if (document_mode) {

Exemplo n.º 21

0

Exibir arquivo

Arquivo: vector.hpp Projeto: WoLpH/raytracer

inline vec4 max(vec4 a, vec4 b) { return _mm_max_ps(a, b); }

Exemplo n.º 22

0

Exibir arquivo

Arquivo: CDLOpCPU.cpp Projeto: KevinJW/OpenColorIO

inline void ApplyClamp(__m128& pix)
{
    pix = _mm_min_ps(_mm_max_ps(pix, EZERO), EONE);
}

Exemplo n.º 23

0

Exibir arquivo

Arquivo: imgdiff_main.cpp Projeto: daseyb/imgdiff_bindings

  _declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) {
    if (options.ignoreColor) {
      makeGreyscale(left);
      makeGreyscale(right);
    }

    float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16);
    int colorOffset = left.width * left.height;
    Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 };

    float* drp = diff.r;
    float* dgp = diff.g;
    float* dbp = diff.b;
    float* dap = diff.a;

    float* lrp = left.r;
    float* lgp = left.g;
    float* lbp = left.b;
    float* lap = left.a;

    float* rrp = right.r;
    float* rgp = right.g;
    float* rbp = right.b;
    float* rap = right.a;

    Color error = ConvertToFloat(options.errorColor);

    auto er = _mm_set_ps1(error.r);
    auto eg = _mm_set_ps1(error.g);
    auto eb = _mm_set_ps1(error.b);
    auto ea = _mm_set_ps1(error.a);

    auto tolerance = _mm_set_ps1(options.tolerance);
    auto overlayTransparency = _mm_set_ps1(options.overlayTransparency);

    OverlayType overlayType = options.overlayType;
    byte weightByDiffPercentage = options.weightByDiffPercentage;

    auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0);
    auto onei = _mm_set1_epi32(1);
    auto one = _mm_set1_ps(1);
    auto zero = _mm_set1_ps(0);

    for (int y = 0; y < left.height; y++) {
      for (int x = 0; x < left.width; x+=4) {
        auto lr = _mm_load_ps(lrp);
        auto lg = _mm_load_ps(lgp);
        auto lb = _mm_load_ps(lbp);
        auto la = _mm_load_ps(lap);

        auto rr = _mm_load_ps(rrp);
        auto rg = _mm_load_ps(rgp);
        auto rb = _mm_load_ps(rbp);
        auto ra = _mm_load_ps(rap);

        auto rdiff = _mm_sub_ps(rr, lr);
        auto gdiff = _mm_sub_ps(rg, lg);
        auto bdiff = _mm_sub_ps(rb, lb);
        auto adiff = _mm_sub_ps(ra, la);

        auto distance = _mm_mul_ps(rdiff, rdiff);
        distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff));
        distance = _mm_sqrt_ps(distance);

        auto t = overlayTransparency;
        if (weightByDiffPercentage) {
          t = _mm_mul_ps(t, distance);
        }

        auto isdiff = _mm_cmpgt_ps(distance, tolerance);

        t = _mm_min_ps(one, _mm_max_ps(zero, t));
        auto mlr = rr;
        auto mlg = rg;
        auto mlb = rb;
        auto mla = ra;

        if (overlayType == OverlayType::Movement) {
          mlr = _mm_mul_ps(mlr, er);
          mlg = _mm_mul_ps(mlg, eg);
          mlb = _mm_mul_ps(mlb, eb);
          mla = _mm_mul_ps(mla, ea);
        }

        auto oneMinusT = _mm_sub_ps(one, t);

        auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t));
        auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t));
        auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t));
        auto mixedA = one;

        if (overlayType != OverlayType::Movement) {
          mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t));
        }

        // (((b ^ a) & mask)^a)
        auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr)));
        auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg)));
        auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb)));
        auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la)));

        diffPixelCount = _mm_xor_si128(diffPixelCount,
          _mm_and_si128(_mm_castps_si128(isdiff),
            _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei),
              diffPixelCount)));

        _mm_store_ps(drp, dr);
        _mm_store_ps(dgp, dg);
        _mm_store_ps(dbp, db);
        _mm_store_ps(dap, da);

        drp+=4;
        dgp+=4;
        dbp+=4;
        dap+=4;

        lrp+=4;
        lgp+=4;
        lbp+=4;
        lap+=4;

        rrp+=4;
        rgp+=4;
        rbp+=4;
        rap+=4;
      }
    }

    int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16);
    _mm_store_si128((__m128i*)pixelCounts, diffPixelCount);

    int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + pixelCounts[3];
    _aligned_free(pixelCounts);

    return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) };
  }

Exemplo n.º 24

0

Exibir arquivo

Arquivo: conversion.cpp Projeto: jabernet/YCbCr2RGB

void ConvertVideoFrame420ToRGB(
	const th_info *tinfo,
	const th_ycbcr_buffer ycbcr,
	unsigned char* pixels
)
{
	// some constant definitions 

	const float single_yoffset = 16.0f;
	const float single_yexcursion = 219.0f;
	const float single_cboffset = 128.0f;
	const float single_cbexcursion = 224.0f;
	const float single_croffset = 128.0f;
	const float single_crexcursion = 224.0f;

	const float kr = 0.299f;
	const float kb = 0.114f;

	if (pixels)
	{
		const th_img_plane yplane = ycbcr[0];
		const th_img_plane cbplane = ycbcr[1];
		const th_img_plane crplane = ycbcr[2];

		const int width = tinfo->pic_width;
		const int height = tinfo->pic_height;
		const int wh = width*height;
		
		assert(wh == yplane.width*yplane.height);
		assert(width % 16 == 0);
		assert(cbplane.width * 2 == yplane.width);
		assert(crplane.width * 2 == yplane.width);

		const unsigned char* ydata = yplane.data;
		const unsigned char* cbdata = cbplane.data;
		const unsigned char* crdata = crplane.data;

		const int ystride = yplane.stride;
		const int cbstride = cbplane.stride;
		const int crstride = crplane.stride;

		const __m128 yoffset = _mm_set_ps1(-single_yoffset);
		const __m128 yexcursion = _mm_set_ps1(1.0f / single_yexcursion);
		const __m128 cboffset = _mm_set_ps1(-single_cboffset);
		const __m128 cbexcursion = _mm_set_ps1(1.0f / single_cbexcursion);
		const __m128 croffset = _mm_set_ps1(-single_croffset);
		const __m128 crexcursion = _mm_set_ps1(1.0f / single_crexcursion);

		const __m128 fr = _mm_set_ps1(255.0f * 2 * (1 - kr));
		const __m128 fb = _mm_set_ps1(255.0f * 2 * (1 - kb));

		const __m128 f1 = _mm_set_ps1(255.0f * (2 * (1 - kb) * kb / (1 - kb - kr)));
		const __m128 f2 = _mm_set_ps1(255.0f * (2 * (1 - kr) * kr / (1 - kb - kr)));

		const __m128 c255 = _mm_set_ps1(255.0f);

		for(int h = 0; h < height; ++h)
		{
			for(int w = 0; w < width; w += 16)
			{
				const __m128i yIn = _mm_loadu_si128((const __m128i*)(ydata + h*ystride + w));
				// assumption is that there is only one pixel in the cb/cr plane per 4 pixels (2x2) in the y plane
				const __m128i cbIn = _mm_loadu_si128((const __m128i*)(cbdata + h/2*cbstride + w/2));
				const __m128i crIn = _mm_loadu_si128((const __m128i*)(crdata + h/2*crstride + w/2));

				// yIn ep8 -> ps

				const __m128i yInlo = _mm_unpacklo_epi8((yIn), _mm_setzero_si128());
				const __m128i yInHi = _mm_unpackhi_epi8((yIn), _mm_setzero_si128());
				const __m128i yIn1 = _mm_unpacklo_epi16(yInlo, _mm_setzero_si128());
				const __m128i yIn4 = _mm_unpackhi_epi16(yInlo, _mm_setzero_si128());
				const __m128i yIn8 = _mm_unpacklo_epi16(yInHi, _mm_setzero_si128());
				const __m128i yIn12 = _mm_unpackhi_epi16(yInHi, _mm_setzero_si128());

				const __m128 yIn1ps = _mm_cvtepi32_ps(yIn1);
				const __m128 yIn2ps = _mm_cvtepi32_ps(yIn4);
				const __m128 yIn3ps = _mm_cvtepi32_ps(yIn8);
				const __m128 yIn4ps = _mm_cvtepi32_ps(yIn12);

				// cbIn ep8 -> ps

				const __m128i cbInExp = _mm_unpacklo_epi8(cbIn, cbIn);
				const __m128i cbInlo = _mm_unpacklo_epi8(cbInExp, _mm_setzero_si128());
				const __m128i cbInHi = _mm_unpackhi_epi8(cbInExp, _mm_setzero_si128());
				const __m128i cbIn1 = _mm_unpacklo_epi16(cbInlo, _mm_setzero_si128());
				const __m128i cbIn4 = _mm_unpackhi_epi16(cbInlo, _mm_setzero_si128());
				const __m128i cbIn8 = _mm_unpacklo_epi16(cbInHi, _mm_setzero_si128());
				const __m128i cbIn12 = _mm_unpackhi_epi16(cbInHi, _mm_setzero_si128());

				const __m128 cbIn1ps = _mm_cvtepi32_ps(cbIn1);
				const __m128 cbIn2ps = _mm_cvtepi32_ps(cbIn4);
				const __m128 cbIn3ps = _mm_cvtepi32_ps(cbIn8);
				const __m128 cbIn4ps = _mm_cvtepi32_ps(cbIn12);

				// crIn ep8 -> ps

				const __m128i crInExp = _mm_unpacklo_epi8(crIn, crIn);
				const __m128i crInlo = _mm_unpacklo_epi8(crInExp, _mm_setzero_si128());
				const __m128i crInHi = _mm_unpackhi_epi8(crInExp, _mm_setzero_si128());
				const __m128i crIn1 = _mm_unpacklo_epi16(crInlo, _mm_setzero_si128());
				const __m128i crIn4 = _mm_unpackhi_epi16(crInlo, _mm_setzero_si128());
				const __m128i crIn8 = _mm_unpacklo_epi16(crInHi, _mm_setzero_si128());
				const __m128i crIn12 = _mm_unpackhi_epi16(crInHi, _mm_setzero_si128());

				const __m128 crIn1ps = _mm_cvtepi32_ps(crIn1);
				const __m128 crIn2ps = _mm_cvtepi32_ps(crIn4);
				const __m128 crIn3ps = _mm_cvtepi32_ps(crIn8);
				const __m128 crIn4ps = _mm_cvtepi32_ps(crIn12);

				// map [0..255] to [-1/2..+1/2] resp. [0..1]
			
				const __m128 yOut1ps = _mm_mul_ps(_mm_add_ps(yIn1ps, yoffset), yexcursion);
				const __m128 yOut2ps = _mm_mul_ps(_mm_add_ps(yIn2ps, yoffset), yexcursion);
				const __m128 yOut3ps = _mm_mul_ps(_mm_add_ps(yIn3ps, yoffset), yexcursion);
				const __m128 yOut4ps = _mm_mul_ps(_mm_add_ps(yIn4ps, yoffset), yexcursion);

				const __m128 cbOut1ps = _mm_mul_ps(_mm_add_ps(cbIn1ps, cboffset), cbexcursion);
				const __m128 cbOut2ps = _mm_mul_ps(_mm_add_ps(cbIn2ps, cboffset), cbexcursion);
				const __m128 cbOut3ps = _mm_mul_ps(_mm_add_ps(cbIn3ps, cboffset), cbexcursion);
				const __m128 cbOut4ps = _mm_mul_ps(_mm_add_ps(cbIn4ps, cboffset), cbexcursion);

				const __m128 crOut1ps = _mm_mul_ps(_mm_add_ps(crIn1ps, croffset), crexcursion);
				const __m128 crOut2ps = _mm_mul_ps(_mm_add_ps(crIn2ps, croffset), crexcursion);
				const __m128 crOut3ps = _mm_mul_ps(_mm_add_ps(crIn3ps, croffset), crexcursion);
				const __m128 crOut4ps = _mm_mul_ps(_mm_add_ps(crIn4ps, croffset), crexcursion);

				// do the actual conversion math (on range 0..255/-127..127 instead or 0..1/-1/2..+1/2

				const __m128 y1_255 = _mm_mul_ps(c255, yOut1ps);
				const __m128 y2_255 = _mm_mul_ps(c255, yOut2ps);
				const __m128 y3_255 = _mm_mul_ps(c255, yOut3ps);
				const __m128 y4_255 = _mm_mul_ps(c255, yOut4ps);

				const __m128 r1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fr, crOut1ps));
				const __m128 r2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fr, crOut2ps));
				const __m128 r3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fr, crOut3ps));
				const __m128 r4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fr, crOut4ps));

				const __m128 g1_1 = _mm_sub_ps(_mm_sub_ps(y1_255, _mm_mul_ps(f1, cbOut1ps)), _mm_mul_ps(f2, crOut1ps));
				const __m128 g2_1 = _mm_sub_ps(_mm_sub_ps(y2_255, _mm_mul_ps(f1, cbOut2ps)), _mm_mul_ps(f2, crOut2ps));
				const __m128 g3_1 = _mm_sub_ps(_mm_sub_ps(y3_255, _mm_mul_ps(f1, cbOut3ps)), _mm_mul_ps(f2, crOut3ps));
				const __m128 g4_1 = _mm_sub_ps(_mm_sub_ps(y4_255, _mm_mul_ps(f1, cbOut4ps)), _mm_mul_ps(f2, crOut4ps));

				const __m128 b1_1 = _mm_add_ps(y1_255, _mm_mul_ps(fb, cbOut1ps));
				const __m128 b2_1 = _mm_add_ps(y2_255, _mm_mul_ps(fb, cbOut2ps));
				const __m128 b3_1 = _mm_add_ps(y3_255, _mm_mul_ps(fb, cbOut3ps));
				const __m128 b4_1 = _mm_add_ps(y4_255, _mm_mul_ps(fb, cbOut4ps));

				// clip to 255
				const __m128 r1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r1_1));
				const __m128 r2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r2_1));
				const __m128 r3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r3_1));
				const __m128 r4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, r4_1));

				const __m128 g1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g1_1));
				const __m128 g2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g2_1));
				const __m128 g3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g3_1));
				const __m128 g4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, g4_1));

				const __m128 b1 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b1_1));
				const __m128 b2 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b2_1));
				const __m128 b3 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b3_1));
				const __m128 b4 = _mm_max_ps(_mm_setzero_ps(), _mm_min_ps(c255, b4_1));

				// multiplex rgb channels

#define rgb_multiplex(no) \
				const __m128 rgb##no##_1 = _mm_shuffle_ps( \
					_mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(0, 0, 0, 0)),  \
					_mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(1, 1, 0, 0)),  \
					_MM_SHUFFLE(2, 0, 2, 0)); \
				const __m128 rgb##no##_2 = _mm_shuffle_ps( \
					_mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(1, 1, 1, 1)),  \
					_mm_shuffle_ps(b##no##, g##no##, _MM_SHUFFLE(2, 2, 2, 2)),  \
					_MM_SHUFFLE(2, 0, 2, 0)); \
				const __m128 rgb##no##_3 = _mm_shuffle_ps( \
					_mm_shuffle_ps(r##no##, b##no##, _MM_SHUFFLE(3, 3, 2, 2)),  \
					_mm_shuffle_ps(g##no##, r##no##, _MM_SHUFFLE(3, 3, 3, 3)),  \
					_MM_SHUFFLE(2, 0, 2, 0)); 

				rgb_multiplex(1);
				rgb_multiplex(2);
				rgb_multiplex(3);
				rgb_multiplex(4);

#undef rgb_multiplex

				// pack 32bit -> 8bit

				const __m128i pack1l = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_1), _mm_cvtps_epi32(rgb1_2));
				const __m128i pack1h = _mm_packs_epi32(_mm_cvtps_epi32(rgb1_3), _mm_cvtps_epi32(rgb2_1));
				const __m128i pack1 = _mm_packus_epi16(pack1l, pack1h);

				const __m128i pack2l = _mm_packs_epi32(_mm_cvtps_epi32(rgb2_2), _mm_cvtps_epi32(rgb2_3));
				const __m128i pack2h = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_1), _mm_cvtps_epi32(rgb3_2));
				const __m128i pack2 = _mm_packus_epi16(pack2l, pack2h);

				const __m128i pack3l = _mm_packs_epi32(_mm_cvtps_epi32(rgb3_3), _mm_cvtps_epi32(rgb4_1));
				const __m128i pack3h = _mm_packs_epi32(_mm_cvtps_epi32(rgb4_2), _mm_cvtps_epi32(rgb4_3));
				const __m128i pack3 = _mm_packus_epi16(pack3l, pack3h);

				// and finally store in output

				_mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 0*16), pack1);
				_mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 1*16), pack2);
				_mm_storeu_si128((__m128i*)(pixels + ((wh-width)*3) - h*width*3 + w*3 + 2*16), pack3);
			}
		}

	} // if
}

Exemplo n.º 25

0

Exibir arquivo

Arquivo: sse.hpp Projeto: bobbyluig/Eclipse

RETf MAX_SSE(const __m128 x, const __m128 y) { return _mm_max_ps(x, y); }

Exemplo n.º 26

0

Exibir arquivo

Arquivo: dcraw_ahdfast.c Projeto: shootthemoonfilms/dcraw-fast

static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5);

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}

Exemplo n.º 27

0

Exibir arquivo

Arquivo: AmdtpTransmitStreamProcessor.cpp Projeto: EMATech/ffado

/**
 * @brief mux all audio ports to events
 * @param data 
 * @param offset 
 * @param nevents 
 */
void
AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
                                                    unsigned int offset,
                                                    unsigned int nevents)
{
    unsigned int j;
    quadlet_t *target_event;
    int i;

    float * client_buffers[4];
    float tmp_values[4] __attribute__ ((aligned (16)));
    uint32_t tmp_values_int[4] __attribute__ ((aligned (16)));

    // prepare the scratch buffer
    assert(m_scratch_buffer_size_bytes > nevents * 4);
    memset(m_scratch_buffer, 0, nevents * 4);

    const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
    const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
    const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER);

#if AMDTP_CLIP_FLOATS
    const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
    const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
#endif

    // this assumes that audio ports are sorted by position,
    // and that there are no gaps
    for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) {
        struct _MBLA_port_cache *p;

        // get the port buffers
        for (j=0; j<4; j++) {
            p = &(m_audio_ports.at(i+j));
            if(likely(p->buffer && p->enabled)) {
                client_buffers[j] = (float *) p->buffer;
                client_buffers[j] += offset;
            } else {
                // if a port is disabled or has no valid
                // buffer, use the scratch buffer (all zero's)
                client_buffers[j] = (float *) m_scratch_buffer;
            }
        }

        // the base event for this position
        target_event = (quadlet_t *)(data + i);
        // process the events
        for (j=0;j < nevents; j += 1)
        {
            // read the values
            tmp_values[0] = *(client_buffers[0]);
            tmp_values[1] = *(client_buffers[1]);
            tmp_values[2] = *(client_buffers[2]);
            tmp_values[3] = *(client_buffers[3]);

            // now do the SSE based conversion/labeling
            __m128 v_float = *((__m128*)tmp_values);
            __m128i *target = (__m128i*)target_event;
            __m128i v_int;

            // clip
#if AMDTP_CLIP_FLOATS
            // do SSE clipping
            v_float = _mm_max_ps(v_float, v_min);
            v_float = _mm_min_ps(v_float, v_max);
#endif

            // multiply
            v_float = _mm_mul_ps(v_float, mult);
            // convert to signed integer
            v_int = _mm_cvttps_epi32( v_float );
            // mask
            v_int = _mm_and_si128( v_int, mask );
            // label it
            v_int = _mm_or_si128( v_int, label );

            // do endian conversion (SSE is always little endian)
            // do first swap
            v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
            // do second swap
            v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
            // store the packed int
            // (target misalignment is assumed since we don't know the m_dimension)
            _mm_storeu_si128 (target, v_int);

            // increment the buffer pointers
            client_buffers[0]++;
            client_buffers[1]++;
            client_buffers[2]++; 
            client_buffers[3]++;

            // go to next target event position
            target_event += m_dimension;
        }
    }

    // do remaining ports
    // NOTE: these can be time-SSE'd
    for (; i < (int)m_nb_audio_ports; i++) {
        struct _MBLA_port_cache &p = m_audio_ports.at(i);
        target_event = (quadlet_t *)(data + i);
#ifdef DEBUG
        assert(nevents + offset <= p.buffer_size );
#endif

        if(likely(p.buffer && p.enabled)) {
            float *buffer = (float *)(p.buffer);
            buffer += offset;
    
            for (j = 0;j < nevents; j += 4)
            {
                // read the values
                tmp_values[0] = *buffer;
                buffer++;
                tmp_values[1] = *buffer;
                buffer++;
                tmp_values[2] = *buffer;
                buffer++;
                tmp_values[3] = *buffer;
                buffer++;

                // now do the SSE based conversion/labeling
                __m128 v_float = *((__m128*)tmp_values);
                __m128i v_int;

#if AMDTP_CLIP_FLOATS
                // do SSE clipping
                v_float = _mm_max_ps(v_float, v_min);
                v_float = _mm_min_ps(v_float, v_max);
#endif
                // multiply
                v_float = _mm_mul_ps(v_float, mult);
                // convert to signed integer
                v_int = _mm_cvttps_epi32( v_float );
                // mask
                v_int = _mm_and_si128( v_int, mask );
                // label it
                v_int = _mm_or_si128( v_int, label );
    
                // do endian conversion (SSE is always little endian)
                // do first swap
                v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
                // do second swap
                v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );

                // store the packed int
                _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int);

                // increment the buffer pointers
                *target_event = tmp_values_int[0];
                target_event += m_dimension;
                *target_event = tmp_values_int[1];
                target_event += m_dimension;
                *target_event = tmp_values_int[2];
                target_event += m_dimension;
                *target_event = tmp_values_int[3];
                target_event += m_dimension;
            }

            // do the remainder of the events
            for(;j < nevents; j += 1) {
                float *in = (float *)buffer;
#if AMDTP_CLIP_FLOATS
                // clip directly to the value of a maxed event
                if(unlikely(*in > 1.0)) {
                    *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF);
                } else if(unlikely(*in < -1.0)) {
                    *target_event = CONDSWAPTOBUS32_CONST(0x40800001);
                } else {
                    float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
                    unsigned int tmp = ((int) v);
                    tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
                    *target_event = CondSwapToBus32((quadlet_t)tmp);
                }
#else
                float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
                unsigned int tmp = ((int) v);
                tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
                *target_event = CondSwapToBus32((quadlet_t)tmp);
#endif
                buffer++;
                target_event += m_dimension;
            }

        } else {
            for (j = 0;j < nevents; j += 1)
            {
                // hardcoded byte swapped
                *target_event = 0x00000040;
                target_event += m_dimension;
            }
        }
    }
}

Exemplo n.º 28

0

Exibir arquivo

Arquivo: nv_num_vector.c Projeto: nagadomi/nv-debian

float 
nv_vector_max(const nv_matrix_t *v, int j)
{
	float v_max = -FLT_MAX;
	int i;
#if NV_ENABLE_AVX
	{
		NV_ALIGNED(float, mm[9], 32);
		__m256 max_vec;
		int pk_lp = (v->n & 0xfffffff8);
		
		max_vec = _mm256_set1_ps(-FLT_MAX);
		for (i = 0; i < pk_lp; i += 8) {
			max_vec = _mm256_max_ps(max_vec, *(const __m256 *)&NV_MAT_V(v, j, i));
		}
		_mm256_store_ps(mm, max_vec);
		
		for (i = pk_lp; i < v->n; ++i) {
			if (NV_MAT_V(v, j, i) > v_max) {
				v_max = NV_MAT_V(v, j, i);
			}
		}
		mm[8] = v_max;
		for (i = 0; i < 9; ++i) {
			if (mm[i] > v_max) {
				v_max = mm[i];
			}
		}
	}
#elif NV_ENABLE_SSE2
	{
		NV_ALIGNED(float, mm[5], 16);
		__m128 max_vec;
		int pk_lp = (v->n & 0xfffffffc);
		
		max_vec = _mm_set1_ps(-FLT_MAX);
		for (i = 0; i < pk_lp; i += 4) {
			max_vec = _mm_max_ps(max_vec, *(const __m128 *)&NV_MAT_V(v, j, i));
		}
		_mm_store_ps(mm, max_vec);
		
		for (i = pk_lp; i < v->n; ++i) {
			if (NV_MAT_V(v, j, i) > v_max) {
				v_max = NV_MAT_V(v, j, i);
			}
		}
		mm[4] = v_max;
		for (i = 0; i < 5; ++i) {
			if (mm[i] > v_max) {
				v_max = mm[i];
			}
		}
	}
#else
	for (i = 0; i < v->n; ++i) {
		if (NV_MAT_V(v, j, i) > v_max) {
			v_max = NV_MAT_V(v, j, i);
		}
	}
#endif
	return v_max;
}

Exemplo n.º 29

0

Exibir arquivo

Arquivo: Draw.cpp Projeto: 1170390/OpenNI2

void YUYVToRGB888(const XnUInt8* pYUVImage, XnUInt8* pRGBAImage, XnUInt32 nYUVSize, XnUInt32 nRGBSize)
{
	const XnUInt8* pYUVLast = pYUVImage + nYUVSize - 8;
	XnUInt8* pRGBLast = pRGBAImage + nRGBSize - 16;

	const __m128 minus128 = _mm_set_ps1(-128);
	const __m128 plus113983 = _mm_set_ps1(1.13983F);
	const __m128 minus039466 = _mm_set_ps1(-0.39466F);
	const __m128 minus058060 = _mm_set_ps1(-0.58060F);
	const __m128 plus203211 = _mm_set_ps1(2.03211F);
	const __m128 zero = _mm_set_ps1(0);
	const __m128 plus255 = _mm_set_ps1(255);

	// define YUV floats
	__m128 y;
	__m128 u;
	__m128 v;

	__m128 temp;

	// define RGB floats
	__m128 r;
	__m128 g;
	__m128 b;

	// define RGB integers
	__m128i iR;
	__m128i iG;
	__m128i iB;

	XnUInt32* piR = (XnUInt32*)&iR;
	XnUInt32* piG = (XnUInt32*)&iG;
	XnUInt32* piB = (XnUInt32*)&iB;

	while (pYUVImage <= pYUVLast && pRGBAImage <= pRGBLast)
	{
		// process 4 pixels at once (values should be ordered backwards)
		y = _mm_set_ps(pYUVImage[YUYV_Y2 + YUYV_BPP], pYUVImage[YUYV_Y1 + YUYV_BPP], pYUVImage[YUYV_Y2], pYUVImage[YUYV_Y1]);
		u = _mm_set_ps(pYUVImage[YUYV_U + YUYV_BPP],  pYUVImage[YUYV_U + YUYV_BPP],  pYUVImage[YUYV_U],  pYUVImage[YUYV_U]);
		v = _mm_set_ps(pYUVImage[YUYV_V + YUYV_BPP],  pYUVImage[YUYV_V + YUYV_BPP],  pYUVImage[YUYV_V],  pYUVImage[YUYV_V]);

		u = _mm_add_ps(u, minus128); // u -= 128
		v = _mm_add_ps(v, minus128); // v -= 128

		/*

		http://en.wikipedia.org/wiki/YUV

		From YUV to RGB:
		R =     Y + 1.13983 V
		G =     Y - 0.39466 U - 0.58060 V
		B =     Y + 2.03211 U

		*/ 

		temp = _mm_mul_ps(plus113983, v);
		r = _mm_add_ps(y, temp);

		temp = _mm_mul_ps(minus039466, u);
		g = _mm_add_ps(y, temp);
		temp = _mm_mul_ps(minus058060, v);
		g = _mm_add_ps(g, temp);

		temp = _mm_mul_ps(plus203211, u);
		b = _mm_add_ps(y, temp);

		// make sure no value is smaller than 0
		r = _mm_max_ps(r, zero);
		g = _mm_max_ps(g, zero);
		b = _mm_max_ps(b, zero);

		// make sure no value is bigger than 255
		r = _mm_min_ps(r, plus255);
		g = _mm_min_ps(g, plus255);
		b = _mm_min_ps(b, plus255);

		// convert floats to int16 (there is no conversion to uint8, just to int8).
		iR = _mm_cvtps_epi32(r);
		iG = _mm_cvtps_epi32(g);
		iB = _mm_cvtps_epi32(b);

		// extract the 4 pixels RGB values.
		// because we made sure values are between 0 and 255, we can just take the lower byte
		// of each INT16
		pRGBAImage[0] = piR[0];
		pRGBAImage[1] = piG[0];
		pRGBAImage[2] = piB[0];
		pRGBAImage[3] = 255;

		pRGBAImage[4] = piR[1];
		pRGBAImage[5] = piG[1];
		pRGBAImage[6] = piB[1];
		pRGBAImage[7] = 255;

		pRGBAImage[8] = piR[2];
		pRGBAImage[9] = piG[2];
		pRGBAImage[10] = piB[2];
		pRGBAImage[11] = 255;

		pRGBAImage[12] = piR[3];
		pRGBAImage[13] = piG[3];
		pRGBAImage[14] = piB[3];
		pRGBAImage[15] = 255;

		// advance the streams
		pYUVImage += 8;
		pRGBAImage += 16;
	}
}

Exemplo n.º 30

0

Exibir arquivo

Arquivo: SseMathFuncs.cpp Projeto: Cactuslegs/audacity-of-nope

/* natural logarithm computed for 4 simultaneous float 
return NaN for x <= 0
*/
__m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contain the really base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
#endif

   e = _mm_add_ps(e, one);

   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);


   __m128 z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   y = _mm_mul_ps(y, z);


   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);


   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
   return x;
}