Example #1
void GSVector4i::operator = (const GSVector4& v)
{
	m = _mm_cvttps_epi32(v);
}
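Every example on this page centers on _mm_cvttps_epi32, which converts four packed floats to four 32-bit integers with truncation toward zero. A minimal standalone sketch of just that conversion (the values are illustrative):

#include <emmintrin.h> /* SSE2 */
#include <stdio.h>

int main(void)
{
    __m128 f = _mm_setr_ps(1.7f, -1.7f, 2.5f, -2.5f);
    __m128i i = _mm_cvttps_epi32(f);   /* truncates toward zero: 1, -1, 2, -2 */
    int out[4];
    _mm_storeu_si128((__m128i *)out, i);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}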
Example #2
RETi CVT( const __m128 x ) { return _mm_cvttps_epi32(x); }
Example #3
internal void MixSounds(SoundMixer *mixer, uint32 sampleFrequency, __m128 *realChannel0, __m128 *realChannel1, int32 sampleCount)
{
    i32 sampleCount4 = sampleCount / 4;
    
    // simd globals
    __m128 zero = _mm_set1_ps(0.0f);
    __m128 one  = _mm_set1_ps(1.0f);

    // seconds advanced per sample (1 / sample frequency)
    r32 secondsPerSample = 1.0f / (r32)sampleFrequency;
    
    // NOTE(Joey): clean channels with 0.0f before mixing
    for(int32 sampleIndex = 0; sampleIndex < sampleCount4; ++sampleIndex)
    {
        _mm_store_ps((float*)(realChannel0 + sampleIndex), zero);
        _mm_store_ps((float*)(realChannel1 + sampleIndex), zero);
    }
    
    // NOTE(Joey): loop over all playing sounds and mix into both real floating point channels
    for(PlayingSound **playingSoundPtr = &mixer->FirstPlayingSound;
        *playingSoundPtr;
        )
    {
        PlayingSound *playingSound = *playingSoundPtr;
        bool32 soundFinished = false;
        
        Sound* sound = playingSound->Source;
        if(sound)
        {            
            r32 volume0 = playingSound->CurrentVolume[0];
            r32 volume1 = playingSound->CurrentVolume[1];
            r32 dVolume0 = secondsPerSample*playingSound->dVolume[0];
            r32 dVolume1 = secondsPerSample*playingSound->dVolume[1];
            r32 dSample = playingSound->Pitch;
            
            __m128 *channel0 = realChannel0;
            __m128 *channel1 = realChannel1;            
            __m128 masterVolume4 = _mm_set1_ps(mixer->MasterVolume);
            __m128 volume4_0 = _mm_setr_ps(volume0 + 0.0f*dVolume0,
                                           volume0 + 1.0f*dVolume0,
                                           volume0 + 2.0f*dVolume0,
                                           volume0 + 3.0f*dVolume0);
            __m128 volume4_1 = _mm_setr_ps(volume1 + 0.0f*dVolume1,
                                           volume1 + 1.0f*dVolume1,
                                           volume1 + 2.0f*dVolume1,
                                           volume1 + 3.0f*dVolume1);
            __m128 dVolume4_0 = _mm_set1_ps(4.0f*dVolume0);
            __m128 dVolume4_1 = _mm_set1_ps(4.0f*dVolume1);
            
            Assert(playingSound->SamplesPlayed >= 0);
                        
            // NOTE(Joey): determine the maximum number of samples to mix this frame
            i32 nrSamplesToMix = sampleCount;
            // NOTE(Joey): deltaSampleRates can go negative (likely due to floating-point precision);
            // this is accounted for in the loop condition.
            i32 deltaSampleRates = (sound->SampleCount - RoundReal32ToInt32(playingSound->SamplesPlayed));
            r32 realSamplesRemainingInSound = (r32)deltaSampleRates / (1.0f*dSample);
            i32 samplesRemainingInSound = RoundReal32ToInt32(realSamplesRemainingInSound);
            if(!playingSound->Loop && samplesRemainingInSound <= (i32)nrSamplesToMix)
            {
                nrSamplesToMix = samplesRemainingInSound;
            }
            
            
            
            // NOTE(Joey): determine whether we need to cut the mix count short because a
            // channel's volume ramp reaches its target within this frame.
            bool32 volumeEnded[2] = {};
            // TODO(Joey): make this logic independent of the number of channels
            // NOTE(Joey): there could be a bug here if volumeEnded should trigger for both
            // volumes at the same time; with this logic only one of them will be handled.
            // I don't think this is a problem, but if something breaks, this is a likely spot.
            if(dVolume0 != 0.0f)
            {
                r32 deltaVolume = playingSound->TargetVolume[0] - playingSound->CurrentVolume[0];
                i32 volumeSampleCount = (i32)((deltaVolume / (1.0f*dVolume0)) + 0.5f);
                if(volumeSampleCount <= nrSamplesToMix)
                {
                    nrSamplesToMix = volumeSampleCount;
                    volumeEnded[0] = true;
                }
            }
            if(dVolume1 != 0.0f)
            {
                r32 deltaVolume = playingSound->TargetVolume[1] - playingSound->CurrentVolume[1];
                i32 volumeSampleCount = (i32)((deltaVolume / (1.0f*dVolume1)) + 0.5f);
                if(volumeSampleCount <= nrSamplesToMix)
                {
                    nrSamplesToMix = volumeSampleCount;
                    volumeEnded[1] = true;
                }
            }
            
            // NOTE(Joey): we get into float precision issues; take the expected begin/end sample
            // positions, set playingSound->SamplesPlayed to the expected end position, and derive
            // each sample position inside the loop from the loop index and the begin position.
            r32 beginSamplePosition = playingSound->SamplesPlayed;
            r32 endSamplePosition = beginSamplePosition + nrSamplesToMix*dSample;
            r32 loopIndexC = (endSamplePosition - beginSamplePosition) / (r32)nrSamplesToMix;
            // r32 samplePosition = playingSound->SamplesPlayed;
            // NOTE(Joey): round the loop count down to a multiple of the SIMD width (4)
            for(i32 i = 0; i < nrSamplesToMix - (nrSamplesToMix & 3); i += 4)
            {                   
                r32 samplePosition = beginSamplePosition + loopIndexC*(r32)i;
#if 0 // linear filtering // NOTE(Joey): disabled for now as it doesn't seem to make an 'audible' difference
                __m128 samplePos = _mm_setr_ps(samplePosition + 0.0f*dSample,
                                               samplePosition + 1.0f*dSample,
                                               samplePosition + 2.0f*dSample,
                                               samplePosition + 3.0f*dSample);
                __m128i sampleIndex = _mm_cvttps_epi32(samplePos);
                __m128 frac = _mm_sub_ps(samplePos, _mm_cvtepi32_ps(sampleIndex));                     
                
                __m128 sampleValue0 = _mm_setr_ps(sound->Samples[0][((int32*)&sampleIndex)[0] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[1] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[2] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[3] % sound->SampleCount]);
                __m128 sampleValue1 = _mm_setr_ps(sound->Samples[0][(((int32*)&sampleIndex)[0] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[1] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[2] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[3] + 1) % sound->SampleCount]);
                                                  
                __m128 sampleValue = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, frac), sampleValue0), _mm_mul_ps(frac, sampleValue1));
#else // nearest-neighbor filtering
                __m128i sampleIndex = _mm_setr_epi32(RoundReal32ToInt32(samplePosition + 0.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 1.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 2.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 3.0f*dSample) % sound->SampleCount);

                __m128 sampleValue = _mm_setr_ps(sound->Samples[0][((i32*)&sampleIndex)[0]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[1]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[2]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[3]]);
#endif 
                
                // NOTE(Joey): write 4 SIMD wide 
                __m128 d0 = _mm_load_ps((float*)&channel0[0]);
                __m128 d1 = _mm_load_ps((float*)&channel1[0]); 
                
                d0 = _mm_add_ps(d0, _mm_mul_ps(_mm_mul_ps(masterVolume4, volume4_0), sampleValue));
                d1 = _mm_add_ps(d1, _mm_mul_ps(_mm_mul_ps(masterVolume4, volume4_1), sampleValue));
                                
                _mm_store_ps((float*)&channel0[0], d0);
                _mm_store_ps((float*)&channel1[0], d1);
                
                ++channel0;
                ++channel1;
              
                volume4_0 = _mm_add_ps(volume4_0, dVolume4_0);
                volume4_1 = _mm_add_ps(volume4_1, dVolume4_1);
                             
                // samplePosition += 4.0f*dSample;
            }
            
            // playingSound->SamplesPlayed = samplePosition;
            playingSound->SamplesPlayed = endSamplePosition;
            playingSound->CurrentVolume[0] = ((real32*)&volume4_0)[0];
            playingSound->CurrentVolume[1] = ((real32*)&volume4_1)[0];
            
            // NOTE(Joey): if the target volume was reached during the ramp, snap to it and clear the delta
            for(int32 i = 0; i < ArrayCount(volumeEnded); ++i)
            {
                if(volumeEnded[i])
                {
                    playingSound->CurrentVolume[i] = playingSound->TargetVolume[i];
                    playingSound->dVolume[i] = 0.0f;
                }
            }
            
            // if looping, wrap SamplesPlayed back to the start of the sound
            if(playingSound->Loop && playingSound->SamplesPlayed >= (r32)sound->SampleCount)
               playingSound->SamplesPlayed = playingSound->SamplesPlayed - (r32)sound->SampleCount;
          
            
            soundFinished = !playingSound->Loop && (uint32)playingSound->SamplesPlayed >= sound->SampleCount; 
            if(soundFinished)
                dSample = 0.0f;
        }
        else
        {
            // NOTE(Joey): Load sound here? or when retrieving from Asset manager; I'd say load sound when
            // not available in asset manager, much better path to take
        }
        
        if(soundFinished)
        {
            *playingSoundPtr = playingSound->Next;
            playingSound->Next = mixer->FirstFreePlayingSound;
            mixer->FirstFreePlayingSound = playingSound;
        }
        else
        {
            playingSoundPtr = &playingSound->Next;
        }
    }
}
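The inner loop above follows a load / multiply-accumulate / store pattern per group of four samples. A stripped-down sketch of that pattern, with a hypothetical helper name and a constant gain instead of the ramped volume4_0/volume4_1 vectors:

#include <xmmintrin.h>

/* Hypothetical helper: accumulate count4 groups of 4 samples from src into dst,
   scaled by a constant gain, mirroring the d0/d1 accumulation in MixSounds. */
static void mix_accumulate4(__m128 *dst, const float *src, int count4, float gain)
{
    __m128 gain4 = _mm_set1_ps(gain);
    for (int i = 0; i < count4; ++i)
    {
        __m128 d = _mm_load_ps((const float *)(dst + i));
        __m128 s = _mm_loadu_ps(src + 4 * i);
        d = _mm_add_ps(d, _mm_mul_ps(gain4, s));
        _mm_store_ps((float *)(dst + i), d);
    }
}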
Example #4
void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
  __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  __m128i emm0, emm2, emm4;
  sign_bit_sin = x;
  x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask));
  sign_bit_sin = _mm_and_ps(sign_bit_sin,
                            *reinterpret_cast<const __m128*>(_pi_sign_mask));
  y = _mm_mul_ps(x, *_ps_cephes_FOPI);
  emm2 = _mm_cvttps_epi32(y);
  emm2 = _mm_add_epi32(emm2, *_pi_1);
  emm2 = _mm_and_si128(emm2, *_pi_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm4 = emm2;
  emm0 = _mm_and_si128(emm2, *_pi_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
  emm2 = _mm_and_si128(emm2, *_pi_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);
  xmm1 = *_ps_minus_cephes_DP1;
  xmm2 = *_ps_minus_cephes_DP2;
  xmm3 = *_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
  emm4 = _mm_sub_epi32(emm4, *_pi_2);
  emm4 = _mm_andnot_si128(emm4, *_pi_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  __m128 z = _mm_mul_ps(x, x);
  y = *_ps_coscof_p0;
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *_ps_1);
  __m128 y2 = *_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);
  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Example #5
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(__m128 x, __m128* s, __m128* c) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
    v4si emm0, emm2, emm4;
    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_sign_mask.ps);
    /* extract the sign bit (upper one) */
    sign_bit_sin = _mm_and_ps(sign_bit_sin, constants::sign_mask.ps);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    /* get the swap sign flag for the sine */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    /* get the polynom selection mask for the sine*/
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);

    /* The magic pass: "Extended precision modular arithmetic"
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    emm4 = _mm_sub_epi32(emm4, constants::pi32_2.pi);
    emm4 = _mm_andnot_si128(emm4, constants::pi32_4.pi);
    emm4 = _mm_slli_epi32(emm4, 29);
    v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


    /* Evaluate the first polynom  (0 <= x <= Pi/4) */
    v4sf z = _mm_mul_ps(x,x);
    y = constants::coscof_p0.ps;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynoms */
    xmm3 = poly_mask;
    v4sf ysin2 = _mm_and_ps(xmm3, y2);
    v4sf ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2,ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1,ysin2);
    xmm2 = _mm_add_ps(y,y2);

    /* update the sign */
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
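A possible way to exercise sincos_ps, assuming the constants:: tables it references are defined and linked in; the results can be compared against scalar sin/cos:

#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

void sincos_demo()
{
    __m128 x = _mm_setr_ps(0.0f, 0.5f, 1.0f, 3.0f);
    __m128 s, c;
    sincos_ps(x, &s, &c);

    float xv[4], sv[4], cv[4];
    _mm_storeu_ps(xv, x);
    _mm_storeu_ps(sv, s);
    _mm_storeu_ps(cv, c);
    for (int i = 0; i < 4; ++i)
        std::printf("x=%g  sin=%g (ref %g)  cos=%g (ref %g)\n",
                    xv[i], sv[i], std::sin(xv[i]), cv[i], std::cos(xv[i]));
}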
Example #6
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax)
{
  assert(vertices.size() % 16 == 0);

  // Simple k-means clustering by normal direction to improve backface culling efficiency
  std::vector<__m128> quadNormals;
  for (auto i = 0; i < vertices.size(); i += 4)
  {
    auto v0 = vertices[i + 0];
    auto v1 = vertices[i + 1];
    auto v2 = vertices[i + 2];
    auto v3 = vertices[i + 3];

    quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3))));
  }

  std::vector<__m128> centroids;
  std::vector<uint32_t> centroidAssignment;
  centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f));
  centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f));

  centroidAssignment.resize(vertices.size() / 4);

  bool anyChanged = true;
  for (int iter = 0; iter < 10 && anyChanged; ++iter)
  {
    anyChanged = false;

    for (auto j = 0; j < quadNormals.size(); ++j)
    {
      __m128 normal = quadNormals[j];

      __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity());
      int bestCentroid = -1;
      for (int k = 0; k < centroids.size(); ++k)
      {
        __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F);
        if (_mm_comige_ss(distance, bestDistance))
        {
          bestDistance = distance;
          bestCentroid = k;
        }
      }

      if (centroidAssignment[j] != bestCentroid)
      {
        centroidAssignment[j] = bestCentroid;
        anyChanged = true;
      }
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = _mm_setzero_ps();
    }

    for (int j = 0; j < quadNormals.size(); ++j)
    {
      int k = centroidAssignment[j];

      centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]);
    }

    for (int k = 0; k < centroids.size(); ++k)
    {
      centroids[k] = normalize(centroids[k]);
    }
  }

  std::vector<__m128> orderedVertices;
  for (int k = 0; k < centroids.size(); ++k)
  {
    for (int j = 0; j < vertices.size() / 4; ++j)
    {
      if (centroidAssignment[j] == k)
      {
        orderedVertices.push_back(vertices[4 * j + 0]);
        orderedVertices.push_back(vertices[4 * j + 1]);
        orderedVertices.push_back(vertices[4 * j + 2]);
        orderedVertices.push_back(vertices[4 * j + 3]);
      }
    }
  }

  auto occluder = std::make_unique<Occluder>();

  __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin));

  __m128 scalingX = _mm_set1_ps(2047.0f);
  __m128 scalingY = _mm_set1_ps(2047.0f);
  __m128 scalingZ = _mm_set1_ps(1023.0f);

  __m128 half = _mm_set1_ps(0.5f);

  for (size_t i = 0; i < orderedVertices.size(); i += 16)
  {
    for (auto j = 0; j < 4; ++j)
    {
      // Transform into [0,1] space relative to bounding box
      __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents);
      __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents);
      __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents);
      __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents);

      // Transpose into [xxxx][yyyy][zzzz][wwww]
      _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

      // Scale and truncate to int
      v0 = _mm_fmadd_ps(v0, scalingX, half);
      v1 = _mm_fmadd_ps(v1, scalingY, half);
      v2 = _mm_fmadd_ps(v2, scalingZ, half);

      __m128i X = _mm_cvttps_epi32(v0);
      __m128i Y = _mm_cvttps_epi32(v1);
      __m128i Z = _mm_cvttps_epi32(v2);

      // Pack to 11/11/10 format
      __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z));

      occluder->m_vertexData.push_back(XYZ);
    }
  }

  occluder->m_refMin = refMin;
  occluder->m_refMax = refMax;

  __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity());
  __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity());

  for (size_t i = 0; i < orderedVertices.size(); ++i)
  {
    min = _mm_min_ps(vertices[i], min);
    max = _mm_max_ps(vertices[i], max);
  }

  // Set W = 1 - this is expected by frustum culling code
  min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000);
  max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000);

  occluder->m_boundsMin = min;
  occluder->m_boundsMax = max;

  occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f));

  return occluder;
}
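The packing step above writes X into bits 21..31, Y into bits 10..20 and Z into bits 0..9 of each 32-bit lane (X and Y scaled to 0..2047, Z to 0..1023). A scalar decode sketch, purely for illustration and not part of the original class:

#include <cstdint>

struct DecodedVertex { float x, y, z; };

// Unpack one 11/11/10 lane back to [0,1] coordinates relative to the refMin/refMax box.
static DecodedVertex decode11_11_10(uint32_t packed)
{
    uint32_t xi = (packed >> 21) & 0x7FF;  // 11 bits
    uint32_t yi = (packed >> 10) & 0x7FF;  // 11 bits
    uint32_t zi =  packed        & 0x3FF;  // 10 bits
    return { xi / 2047.0f, yi / 2047.0f, zi / 1023.0f };
}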
Example #7
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
#else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);

   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm4 = emm2;

   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

   /* get the polynom selection mask for the sine*/
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);

   mm4 = mm2;
   mm5 = mm3;

   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

   /* get the polynom selection mask for the sine */

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
#endif

   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);


   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */  
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);

   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);

   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Example #8
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi16(all1, 15);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    for (int i = 0; i < 5; i++) {
        matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero);
        matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 16) {
            uint8_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                __m128i sum[4];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();
                sum[2] = _mm_setzero_si128();
                sum[3] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi8(xmm0, zero);

                    xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                    sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                    xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                    sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
                }

                for (int i = 0; i < 4; i++) {
                    __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);
                }

                sum[0] = _mm_packs_epi32(sum[0], sum[1]);
                sum[1] = _mm_packs_epi32(sum[2], sum[3]);

                if (!ch->saturate) {
                    for (int i = 0; i < 2; i++) {
                        __m128i mask = _mm_cmplt_epi16(sum[i], zero);
                        __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1));
                        temp = _mm_and_si128(temp, mask);
                        sum[i] = _mm_andnot_si128(mask, sum[i]);
                        sum[i] = _mm_or_si128(sum[i], temp);
                    }
                }

                sum[0] = _mm_packus_epi16(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Example #9
__m128 exp_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
   __m128 tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);

   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);

   /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
#endif
   /* if greater, subtract 1 */
   __m128 mask = _mm_cmpgt_ps(tmp, fx);    
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);

   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);

   z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);

   /* build 2^n */
#ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23); 
   mm1 = _mm_slli_pi32(mm1, 23);

   __m128 pow2n; 
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   _mm_empty();
#else
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   __m128 pow2n = _mm_castsi128_ps(emm0);
#endif
   y = _mm_mul_ps(y, pow2n);
   return y;
}
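The "floorf with SSE" trick used in exp_ps (truncate, convert back to float, then subtract 1 wherever the truncation rounded up) can also be written as a standalone helper. A sketch assuming SSE2 only and inputs within int32 range, matching the in-range behavior above:

#include <emmintrin.h>

static inline __m128 floor_ps_sse2(__m128 x)
{
    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(x)); /* truncate toward zero */
    __m128 mask = _mm_cmpgt_ps(t, x);                /* true where truncation landed above x */
    return _mm_sub_ps(t, _mm_and_ps(mask, _mm_set1_ps(1.0f)));
}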
Example #10
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
   __m128i emm0, emm2;
#else
   __m64 mm0, mm1, mm2, mm3;
#endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in mm0 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

   /* get the swap sign flag */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
   /* store the integer part of y in mm0:mm1 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);

   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

   y = _mm_cvtpi32x2_ps(mm2, mm3);


   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

   /* get the swap sign flag in mm0:mm1 and the 
   polynom selection mask in mm2:mm3 */

   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);

   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);

   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
#endif
   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynoms */  
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);

   return y;
}
Example #11
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *
 *            Valid for all IEEE754 floats $x_z$.
 *
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide;
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 *
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent
 *            (0..255), so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k.  If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf.  If k<-127, k+127 is
 *            negative and we get screwed up.  If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.  So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127).  So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5 log2.
 *            That's 87.68312 .. 88.3762655.  I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so
 *              maxlogf =  127.5 log(2) - epsilon
 *              minlogf = -127.5 log(2) + epsilon
 *            for an epsilon that happen to be ~ 3e-6.
 */
__m128
esl_sse_expf(__m128 x)
{
    static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
                                 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f
                               };
    static float cephes_c[2] = { 0.693359375f,    -2.12194440e-4f };
    static float maxlogf     =  88.3762626647949f;  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
    static float minlogf     = -88.3762626647949f;  /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
    __m128i k;
    __m128  mask, tmp, fx, z, y, minmask, maxmask;

    /* handle out-of-range and special conditions */
    maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
    minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

    /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
    fx = _mm_mul_ps(x,  _mm_set1_ps(eslCONST_LOG2R));
    fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

    /* floorf() with SSE:  */
    k    = _mm_cvttps_epi32(fx);	              /* cast to int with truncation                  */
    tmp  = _mm_cvtepi32_ps(k);	              /* cast back to float                           */
    mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
    mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
    fx   = _mm_sub_ps(tmp, mask);	              /* then subtract one.                           */
    k    = _mm_cvttps_epi32(fx);	              /* k is now ready for the 2^k part.             */

    /* polynomial approx for e^f for f in range [-0.5, 0.5] */
    tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
    z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
    x   = _mm_sub_ps(x, tmp);
    x   = _mm_sub_ps(x, z);
    z   = _mm_mul_ps(x, x);

    y =               _mm_set1_ps(cephes_p[0]);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(1.0f));

    /* build 2^k by hand, by creating a IEEE754 float */
    k  = _mm_add_epi32(k, _mm_set1_epi32(127));
    k  = _mm_slli_epi32(k, 23);
    fx = _mm_castsi128_ps(k);

    /* put 2^k e^f together (fx = 2^k,  y = e^f) and we're done */
    y = _mm_mul_ps(y, fx);

    /* special/range cleanup */
    y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
    y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
    return y;
}
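esl_sse_select_ps itself is not shown here. A common SSE blend that matches how it is used above (take the second operand where the mask lanes are all-ones, the first elsewhere) would be the following sketch; the actual Easel routine may differ:

#include <xmmintrin.h>

static inline __m128 select_ps_sketch(__m128 a, __m128 b, __m128 mask)
{
    /* mask lanes are assumed to be 0x00000000 or 0xFFFFFFFF, i.e. comparison results */
    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
}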
Example #12
__m128i test (__m128 p)
{
  return _mm_cvttps_epi32 (p); 
}
Example #13
  {
    template<class Sig> struct result;
    template<class This,class A0>
    struct result<This(A0)>
      { typedef typename meta::as_integer<A0>::type type; };

    NT2_FUNCTOR_CALL_DISPATCH(
      1,
      typename nt2::meta::scalar_of<A0>::type,
      (3, (float,double,arithmetic_))
    )

    NT2_FUNCTOR_CALL_EVAL_IF(1,  float)
    {
      typedef typename NT2_CALL_RETURN_TYPE(1)::type type;
      type that =  {_mm_cvttps_epi32(a0)};
      return  that; 
    }
    NT2_FUNCTOR_CALL_EVAL_IF(1, double)
    {
      typedef typename NT2_CALL_RETURN_TYPE(1)::type type;
      const type v = {{a0[0],a0[1]}};
      return v;
    }
    NT2_FUNCTOR_CALL_EVAL_IF(1, arithmetic_)
    {
      return a0;
    }
  };
} }
#endif
Example #14
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; // 0.5 bias so the truncation below rounds to nearest

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
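The index computation above clamps each float to [0, 0xffff] and truncates it to build a lookup into cielab_cbrt; since _mm_cvttps_epi32 already truncates, the explicit _MM_FROUND_TO_ZERO round is effectively absorbed by the convert. The same pattern in isolation, as a sketch:

#include <emmintrin.h>

/* Clamp four floats to [0, 65535] and truncate to integer table indices. */
static inline __m128i clamp_trunc_index(__m128 v)
{
    v = _mm_max_ps(_mm_setzero_ps(),
                   _mm_min_ps(_mm_set1_ps(65535.0f), v));
    return _mm_cvttps_epi32(v);
}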
Example #15
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride , width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 16) {
            __m128i sum[4] = { zero, zero, zero, zero };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0, xmm1, xmm2;

                xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
            }

            for (int i = 0; i < 4; i++) {
                __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                sumfp = _mm_mul_ps(sumfp, rdiv);
                sumfp = _mm_add_ps(sumfp, bias);
                if (!ch->saturate) {
                    sumfp = mm_abs_ps(sumfp);
                }
                sum[i] = _mm_cvttps_epi32(sumfp);
            }

            sum[0] = _mm_packs_epi32(sum[0], sum[1]);
            sum[1] = _mm_packs_epi32(sum[2], sum[3]);
            sum[0] = _mm_packus_epi16(sum[0], sum[1]);

            _mm_store_si128((__m128i *)(dstp + x), sum[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
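In the convolution above, the final _mm_packs_epi32 / _mm_packus_epi16 pair is what clamps the 32-bit sums to the 0..255 output range. That saturation step in isolation, as a sketch:

#include <emmintrin.h>

/* Saturate sixteen int32 sums (four vectors) down to sixteen uint8 pixels. */
static inline __m128i pack_sums_to_u8(__m128i s0, __m128i s1, __m128i s2, __m128i s3)
{
    __m128i lo = _mm_packs_epi32(s0, s1);  /* int32 -> int16, signed saturation */
    __m128i hi = _mm_packs_epi32(s2, s3);
    return _mm_packus_epi16(lo, hi);       /* int16 -> uint8, unsigned saturation */
}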
Example #16
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);

                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, temp));
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }
                }

                sum[0] = mm_cast_epi32(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Example #17
		SIMD_INLINE __m128i MulDiv32(__m128i dividend, __m128i divisor, const __m128 & KF_255_DIV_6)
		{
			return _mm_cvttps_epi32(_mm_div_ps(_mm_mul_ps(KF_255_DIV_6, _mm_cvtepi32_ps(dividend)), _mm_cvtepi32_ps(divisor)));
		}
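MulDiv32 computes trunc(dividend * K / divisor) per lane, where K is whatever the KF_255_DIV_6 argument holds. A hedged usage sketch, assuming that constant carries 255.0f/6.0f in every lane and that MulDiv32 is visible in the calling scope:

static __m128i MulDiv32Demo()
{
    const __m128 KF_255_DIV_6 = _mm_set1_ps(255.0f / 6.0f);
    __m128i dividend = _mm_setr_epi32(6, 12, 18, 24);
    __m128i divisor  = _mm_set1_epi32(3);
    return MulDiv32(dividend, divisor, KF_255_DIV_6); /* {85, 170, 255, 340}, truncated */
}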
Example #18
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i tmp1, tmp2, tmp3;

    // int m = da ? dc * 256 / da : 0;
    __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128());
    __m128i m = _mm_slli_epi32(dc, 8);
    __m128 x = _mm_cvtepi32_ps(m);
    __m128 y = _mm_cvtepi32_ps(da);
    m = _mm_cvttps_epi32(_mm_div_ps(x, y));
    m = _mm_andnot_si128(cmp, m);

    // if (2 * sc <= sa)
    tmp1 = _mm_slli_epi32(sc, 1);                      // 2 * sc
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    tmp1 = _mm_sub_epi32(tmp1, sa);                    // 2 * sc - sa
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m);      // 256 - m
    tmp1 = Multiply32_SSE2(tmp1, tmp2);
    tmp1 = _mm_srai_epi32(tmp1, 8);
    tmp1 = _mm_add_epi32(sa, tmp1);
    tmp1 = Multiply32_SSE2(dc, tmp1);
    __m128i rc1 = _mm_andnot_si128(cmp1, tmp1);

    // else if (4 * dc <= da)
    tmp2 = _mm_slli_epi32(dc, 2);                      // dc * 4
    __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da);
    __m128i i = _mm_slli_epi32(m, 2);                  // 4 * m
    __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256
    __m128i k = Multiply32_SSE2(i, j);                 // 4 * m * (4 * m + 256)
    __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256
    i = Multiply32_SSE2(k, t);                         // 4 * m * (4 * m + 256) * (m - 256)
    i = _mm_srai_epi32(i, 16);                         // >> 16
    j = Multiply32_SSE2(_mm_set1_epi32(7), m);         // 7 * m
    tmp2 = _mm_add_epi32(i, j);
    i = Multiply32_SSE2(dc, sa);                       // dc * sa
    j = _mm_slli_epi32(sc, 1);                         // 2 * sc
    j = _mm_sub_epi32(j, sa);                          // 2 * sc - sa
    j = Multiply32_SSE2(da, j);                        // da * (2 * sc - sa)
    tmp2 = Multiply32_SSE2(j, tmp2);                   // * tmp
    tmp2 = _mm_srai_epi32(tmp2, 8);                    // >> 8
    tmp2 = _mm_add_epi32(i, tmp2);
    cmp = _mm_andnot_si128(cmp2, cmp1);
    __m128i rc2 = _mm_and_si128(cmp, tmp2);
    __m128i rc = _mm_or_si128(rc1, rc2);

    // else
    tmp3 = sqrt_unit_byte_SSE2(m);
    tmp3 = _mm_sub_epi32(tmp3, m);
    tmp3 = Multiply32_SSE2(j, tmp3);                   // j = da * (2 * sc - sa)
    tmp3 = _mm_srai_epi32(tmp3, 8);
    tmp3 = _mm_add_epi32(i, tmp3);                     // i = dc * sa
    cmp = _mm_and_si128(cmp1, cmp2);
    __m128i rc3 = _mm_and_si128(cmp, tmp3);
    rc = _mm_or_si128(rc, rc3);

    tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da);     // 255 - da
    tmp1 = _mm_mullo_epi16(sc, tmp1);
    tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);     // 255 - sa
    tmp2 = _mm_mullo_epi16(dc, tmp2);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
Example #19
/* evaluation of 4 sines at once, using only SSE2.

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning) !
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments..

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed on the experiments with Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
*/
__m128 sin_ps(__m128 x) { // any x
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

    v4si emm0, emm2;
    sign_bit = x;
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_mant_mask.ps);
    /* extract the sign bit (upper one) */
    sign_bit = _mm_and_ps(sign_bit, constants::sign_mask.ps);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);
    /* get the swap sign flag */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynom selection mask
       there is one polynom for 0 <= x <= Pi/4
       and another one for Pi/4<x<=Pi/2

       Both branches will be computed.
    */
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    /* The magic pass: "Extended precision modular arithmetic"
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynom  (0 <= x <= Pi/4) */
    y = constants::coscof_p0.ps;
    v4sf z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynomial  (Pi/4 <= x <= Pi/2) */

    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynoms */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);
    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
}
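A small usage check for sin_ps, again assuming the constants:: tables it references are defined; per the note above the function, accuracy should hold for |x| < 8192:

#include <cmath>
#include <cstdio>

void sin_ps_demo()
{
    __m128 x = _mm_setr_ps(0.1f, 1.0f, 3.14159265f, 100.0f);
    float xv[4], r[4];
    _mm_storeu_ps(xv, x);
    _mm_storeu_ps(r, sin_ps(x));
    for (int i = 0; i < 4; ++i)
        std::printf("sin_ps(%g) = %g   (sinf: %g)\n", xv[i], r[i], std::sin(xv[i]));
}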