// Assigns from a float vector: each of the four float lanes of v is converted
// to a signed 32-bit integer with truncation (_mm_cvttps_epi32) and stored in m.
void GSVector4i::operator = (const GSVector4& v)
{
	const __m128i truncated = _mm_cvttps_epi32(v);

	m = truncated;
}
// Converts the four packed floats in x to packed signed 32-bit integers,
// truncating each lane (_mm_cvttps_epi32).
RETi CVT( const __m128 x )
{
    const __m128i truncated = _mm_cvttps_epi32(x);
    return truncated;
}
// Mixes every playing sound of `mixer` into two floating-point output channels.
//
//   mixer           - owns the linked list of playing sounds (FirstPlayingSound), the free
//                     list finished sounds are returned to (FirstFreePlayingSound) and the
//                     master volume applied to every sound.
//   sampleFrequency - output sample rate; converts the per-second volume deltas
//                     (PlayingSound::dVolume) into per-sample deltas.
//   realChannel0/1  - output buffers, accessed with aligned SIMD loads/stores.
//   sampleCount     - number of output samples requested. Both the clear pass and the mix
//                     pass work in whole groups of 4 samples.
//
// For each sound the function advances SamplesPlayed, ramps CurrentVolume towards
// TargetVolume, wraps looping sounds and unlinks finished non-looping sounds.
internal void MixSounds(SoundMixer *mixer, uint32 sampleFrequency, __m128 *realChannel0, __m128 *realChannel1, int32 sampleCount)
{
    i32 sampleCount4 = sampleCount / 4;

    // simd globals
    __m128 zero = _mm_set1_ps(0.0f);

    // duration of a single output sample, in seconds
    r32 secondsPerSample = 1.0f / (r32)sampleFrequency;

    // NOTE(Joey): clean channels with 0.0f before mixing
    for(int32 sampleIndex = 0; sampleIndex < sampleCount4; ++sampleIndex)
    {
        _mm_store_ps((float*)(realChannel0 + sampleIndex), zero);
        _mm_store_ps((float*)(realChannel1 + sampleIndex), zero);
    }

    // NOTE(Joey): loop over all playing sounds and mix into both real floating point channels
    for(PlayingSound **playingSoundPtr = &mixer->FirstPlayingSound; *playingSoundPtr; )
    {
        PlayingSound *playingSound = *playingSoundPtr;
        bool32 soundFinished = false;

        Sound* sound = playingSound->Source;
        if(sound)
        {
            r32 volume0 = playingSound->CurrentVolume[0];
            r32 volume1 = playingSound->CurrentVolume[1];
            r32 dVolume0 = secondsPerSample*playingSound->dVolume[0];
            r32 dVolume1 = secondsPerSample*playingSound->dVolume[1];
            r32 dSample = playingSound->Pitch;

            __m128 *channel0 = realChannel0;
            __m128 *channel1 = realChannel1;

            __m128 masterVolume4 = _mm_set1_ps(mixer->MasterVolume);
            // per-lane volumes for 4 consecutive samples of the ramp
            __m128 volume4_0 = _mm_setr_ps(volume0 + 0.0f*dVolume0,
                                           volume0 + 1.0f*dVolume0,
                                           volume0 + 2.0f*dVolume0,
                                           volume0 + 3.0f*dVolume0);
            __m128 volume4_1 = _mm_setr_ps(volume1 + 0.0f*dVolume1,
                                           volume1 + 1.0f*dVolume1,
                                           volume1 + 2.0f*dVolume1,
                                           volume1 + 3.0f*dVolume1);
            __m128 dVolume4_0 = _mm_set1_ps(4.0f*dVolume0);
            __m128 dVolume4_1 = _mm_set1_ps(4.0f*dVolume1);

            Assert(playingSound->SamplesPlayed >= 0);

            // NOTE(Joey): determine the maximum number of samples to mix this frame
            i32 nrSamplesToMix = sampleCount;
            // NOTE(Joey): deltaSampleRates can get negative (likely due to floating point precision);
            // nrSamplesToMix is clamped to 0 below so a negative count never rewinds the sound.
            i32 deltaSampleRates = (sound->SampleCount - RoundReal32ToInt32(playingSound->SamplesPlayed));
            r32 realSamplesRemainingInSound = (r32)deltaSampleRates / (1.0f*dSample);
            i32 samplesRemainingInSound = RoundReal32ToInt32(realSamplesRemainingInSound);
            if(!playingSound->Loop && samplesRemainingInSound <= (i32)nrSamplesToMix)
            {
                nrSamplesToMix = samplesRemainingInSound;
            }

            // NOTE(Joey): determine if we need to break out of the loop early due to volume
            // attenuation reaching its target volume per channel.
            bool32 volumeEnded[2] = {};
            // TODO(Joey): make logic independent of number of channels
            // NOTE(Joey): could have a bug here where the volumeEnded (should) get triggered
            // for both volumes at the same time; which due to this logic will only work on one
            // volume item. I don't think this is a problem, but if so - this is likely it.
            if(dVolume0 != 0.0f)
            {
                r32 deltaVolume = playingSound->TargetVolume[0] - playingSound->CurrentVolume[0];
                i32 volumeSampleCount = (i32)((deltaVolume / (1.0f*dVolume0)) + 0.5f);
                if(volumeSampleCount <= nrSamplesToMix)
                {
                    nrSamplesToMix = volumeSampleCount;
                    volumeEnded[0] = true;
                }
            }
            if(dVolume1 != 0.0f)
            {
                r32 deltaVolume = playingSound->TargetVolume[1] - playingSound->CurrentVolume[1];
                // Signed cast, same as channel 0: the quotient can be negative and converting a
                // negative float to an unsigned integer is undefined behaviour.
                i32 volumeSampleCount = (i32)((deltaVolume / (1.0f*dVolume1)) + 0.5f);
                if(volumeSampleCount <= nrSamplesToMix)
                {
                    nrSamplesToMix = volumeSampleCount;
                    volumeEnded[1] = true;
                }
            }

            // A negative count would move SamplesPlayed backwards below; treat it as
            // "nothing left to mix this frame".
            if(nrSamplesToMix < 0)
            {
                nrSamplesToMix = 0;
            }

            // NOTE(Joey): we get into float precision issues; take expected begin/end sample
            // position and set playingSound->SamplesPlayed to expected end sample position.
            // then derive each sample position in the loop from the loop index and the begin position.
            r32 beginSamplePosition = playingSound->SamplesPlayed;
            r32 endSamplePosition = beginSamplePosition + nrSamplesToMix*dSample;
            // guard against 0/0 when there is nothing to mix (the value is unused in that case)
            r32 loopIndexC = (nrSamplesToMix > 0)
                ? (endSamplePosition - beginSamplePosition) / (r32)nrSamplesToMix
                : 0.0f;

            // NOTE(Joey): clamp nrSamplesToMix loop condition to SIMD width of 4
            i32 simdSampleCount = nrSamplesToMix - (nrSamplesToMix & 3);
            for(i32 i = 0; i < simdSampleCount; i += 4)
            {
                r32 samplePosition = beginSamplePosition + loopIndexC*(r32)i;

#if 0 // linear filtering
                // NOTE(Joey): disabled for now as it doesn't seem to make an 'audible' difference
                __m128 one = _mm_set1_ps(1.0f);
                __m128 samplePos = _mm_setr_ps(samplePosition + 0.0f*dSample,
                                               samplePosition + 1.0f*dSample,
                                               samplePosition + 2.0f*dSample,
                                               samplePosition + 3.0f*dSample);
                __m128i sampleIndex = _mm_cvttps_epi32(samplePos);
                __m128 frac = _mm_sub_ps(samplePos, _mm_cvtepi32_ps(sampleIndex));

                __m128 sampleValue0 = _mm_setr_ps(sound->Samples[0][((int32*)&sampleIndex)[0] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[1] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[2] % sound->SampleCount],
                                                  sound->Samples[0][((int32*)&sampleIndex)[3] % sound->SampleCount]);
                __m128 sampleValue1 = _mm_setr_ps(sound->Samples[0][(((int32*)&sampleIndex)[0] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[1] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[2] + 1) % sound->SampleCount],
                                                  sound->Samples[0][(((int32*)&sampleIndex)[3] + 1) % sound->SampleCount]);

                __m128 sampleValue = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(one, frac), sampleValue0),
                                                _mm_mul_ps(frac, sampleValue1));
#else // nearest-neighbor filtering
                __m128i sampleIndex = _mm_setr_epi32(RoundReal32ToInt32(samplePosition + 0.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 1.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 2.0f*dSample) % sound->SampleCount,
                                                     RoundReal32ToInt32(samplePosition + 3.0f*dSample) % sound->SampleCount);
                __m128 sampleValue = _mm_setr_ps(sound->Samples[0][((i32*)&sampleIndex)[0]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[1]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[2]],
                                                 sound->Samples[0][((i32*)&sampleIndex)[3]]);
#endif

                // NOTE(Joey): accumulate 4 samples into each output channel
                __m128 d0 = _mm_load_ps((float*)&channel0[0]);
                __m128 d1 = _mm_load_ps((float*)&channel1[0]);

                d0 = _mm_add_ps(d0, _mm_mul_ps(_mm_mul_ps(masterVolume4, volume4_0), sampleValue));
                d1 = _mm_add_ps(d1, _mm_mul_ps(_mm_mul_ps(masterVolume4, volume4_1), sampleValue));

                _mm_store_ps((float*)&channel0[0], d0);
                _mm_store_ps((float*)&channel1[0], d1);

                ++channel0;
                ++channel1;
                volume4_0 = _mm_add_ps(volume4_0, dVolume4_0);
                volume4_1 = _mm_add_ps(volume4_1, dVolume4_1);
            }

            playingSound->SamplesPlayed = endSamplePosition;
            // lane 0 holds the volume of the next sample that would be mixed
            playingSound->CurrentVolume[0] = ((real32*)&volume4_0)[0];
            playingSound->CurrentVolume[1] = ((real32*)&volume4_1)[0];

            // NOTE(Joey): if the target volume is reached due to attenuation, snap to it and
            // reset the delta volume
            for(int32 i = 0; i < ArrayCount(volumeEnded); ++i)
            {
                if(volumeEnded[i])
                {
                    playingSound->CurrentVolume[i] = playingSound->TargetVolume[i];
                    playingSound->dVolume[i] = 0.0f;
                }
            }

            // if loop, re-position SamplesPlayed to start of sound
            if(playingSound->Loop && playingSound->SamplesPlayed >= (r32)sound->SampleCount)
            {
                playingSound->SamplesPlayed = playingSound->SamplesPlayed - (r32)sound->SampleCount;
            }

            soundFinished = !playingSound->Loop && (uint32)playingSound->SamplesPlayed >= sound->SampleCount;
        }
        else
        {
            // NOTE(Joey): Load sound here? or when retrieving from Asset manager; I'd say load sound when
            // not available in asset manager, much better path to take
        }

        if(soundFinished)
        {
            // unlink from the playing list and push onto the free list
            *playingSoundPtr = playingSound->Next;
            playingSound->Next = mixer->FirstFreePlayingSound;
            mixer->FirstFreePlayingSound = playingSound;
        }
        else
        {
            playingSoundPtr = &playingSound->Next;
        }
    }
}
void sincos_ps(__m128 x, __m128 *s, __m128 *c) { __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; __m128i emm0, emm2, emm4; sign_bit_sin = x; x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask)); sign_bit_sin = _mm_and_ps(sign_bit_sin, *reinterpret_cast<const __m128*>(_pi_sign_mask)); y = _mm_mul_ps(x, *_ps_cephes_FOPI); emm2 = _mm_cvttps_epi32(y); emm2 = _mm_add_epi32(emm2, *_pi_1); emm2 = _mm_and_si128(emm2, *_pi_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; emm0 = _mm_and_si128(emm2, *_pi_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); emm2 = _mm_and_si128(emm2, *_pi_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); xmm1 = *_ps_minus_cephes_DP1; xmm2 = *_ps_minus_cephes_DP2; xmm3 = *_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); emm4 = _mm_sub_epi32(emm4, *_pi_2); emm4 = _mm_andnot_si128(emm4, *_pi_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); __m128 z = _mm_mul_ps(x, x); y = *_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *_ps_1); __m128 y2 = *_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1, ysin2); xmm2 = _mm_add_ps(y, y2); *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, 
sign_bit_cos); }
/* Combined sine/cosine of four packed floats.
   sin_ps and cos_ps share almost all of their work, so this routine can stand
   in for both: it costs about the same as one of them and yields the cosine
   alongside the sine.  Sine lanes go to *s, cosine lanes to *c. */
void sincos_ps(__m128 x, __m128* s, __m128* c) {
  /* remember the sign bit of the input, then continue with |x| */
  __m128 sine_sign = _mm_and_ps(x, constants::sign_mask.ps);
  __m128 arg = _mm_and_ps(x, constants::inv_sign_mask.ps);

  /* scale by 4/Pi and keep the integer part */
  __m128i quadrant = _mm_cvttps_epi32(_mm_mul_ps(arg, constants::cephes_FOPI.ps));

  /* j=(j+1) & (~1) (see the cephes sources) */
  quadrant = _mm_add_epi32(quadrant, constants::pi32_1.pi);
  quadrant = _mm_and_si128(quadrant, constants::pi32_inv1.pi);
  const __m128 quadrant_ps = _mm_cvtepi32_ps(quadrant);

  /* swap-sign flag for the sine: (j & 4) moved up to the float sign bit */
  const __m128i sine_flip_bits =
      _mm_slli_epi32(_mm_and_si128(quadrant, constants::pi32_4.pi), 29);
  sine_sign = _mm_xor_ps(sine_sign, _mm_castsi128_ps(sine_flip_bits));

  /* sign flag for the cosine: ~(j - 2) & 4, moved up the same way */
  const __m128i cosine_sign_bits = _mm_slli_epi32(
      _mm_andnot_si128(_mm_sub_epi32(quadrant, constants::pi32_2.pi),
                       constants::pi32_4.pi),
      29);
  const __m128 cosine_sign = _mm_castsi128_ps(cosine_sign_bits);

  /* polynomial selection mask: all ones in lanes where (j & 2) == 0 */
  const __m128i select_bits = _mm_cmpeq_epi32(
      _mm_and_si128(quadrant, constants::pi32_2.pi), _mm_setzero_si128());
  const __m128 select_mask = _mm_castsi128_ps(select_bits);

  /* range reduction in three steps:
     x = ((x - y * DP1) - y * DP2) - y * DP3; (the constants are stored negated) */
  arg = _mm_add_ps(arg, _mm_mul_ps(quadrant_ps, constants::minus_cephes_DP1.ps));
  arg = _mm_add_ps(arg, _mm_mul_ps(quadrant_ps, constants::minus_cephes_DP2.ps));
  arg = _mm_add_ps(arg, _mm_mul_ps(quadrant_ps, constants::minus_cephes_DP3.ps));

  const __m128 arg_sq = _mm_mul_ps(arg, arg);

  /* first polynomial (coscof coefficients) */
  __m128 poly_cos = constants::coscof_p0.ps;
  poly_cos = _mm_mul_ps(poly_cos, arg_sq);
  poly_cos = _mm_add_ps(poly_cos, constants::coscof_p1.ps);
  poly_cos = _mm_mul_ps(poly_cos, arg_sq);
  poly_cos = _mm_add_ps(poly_cos, constants::coscof_p2.ps);
  poly_cos = _mm_mul_ps(poly_cos, arg_sq);
  poly_cos = _mm_mul_ps(poly_cos, arg_sq);
  poly_cos = _mm_sub_ps(poly_cos, _mm_mul_ps(arg_sq, constants::ps_0p5.ps));
  poly_cos = _mm_add_ps(poly_cos, constants::ps_1.ps);

  /* second polynomial (sincof coefficients) */
  __m128 poly_sin = constants::sincof_p0.ps;
  poly_sin = _mm_mul_ps(poly_sin, arg_sq);
  poly_sin = _mm_add_ps(poly_sin, constants::sincof_p1.ps);
  poly_sin = _mm_mul_ps(poly_sin, arg_sq);
  poly_sin = _mm_add_ps(poly_sin, constants::sincof_p2.ps);
  poly_sin = _mm_mul_ps(poly_sin, arg_sq);
  poly_sin = _mm_mul_ps(poly_sin, arg);
  poly_sin = _mm_add_ps(poly_sin, arg);

  /* route each polynomial to the sine or the cosine, lane by lane */
  const __m128 sine_part_a = _mm_andnot_ps(select_mask, poly_cos);
  const __m128 sine_part_b = _mm_and_ps(select_mask, poly_sin);
  const __m128 cosine_part_b = _mm_sub_ps(poly_sin, sine_part_b);
  const __m128 cosine_part_a = _mm_sub_ps(poly_cos, sine_part_a);

  const __m128 sine_abs = _mm_add_ps(sine_part_a, sine_part_b);
  const __m128 cosine_abs = _mm_add_ps(cosine_part_a, cosine_part_b);

  /* update the sign */
  *s = _mm_xor_ps(sine_abs, sine_sign);
  *c = _mm_xor_ps(cosine_abs, cosine_sign);
}
std::unique_ptr<Occluder> Occluder::bake(const std::vector<__m128>& vertices, __m128 refMin, __m128 refMax) { assert(vertices.size() % 16 == 0); // Simple k-means clustering by normal direction to improve backface culling efficiency std::vector<__m128> quadNormals; for (auto i = 0; i < vertices.size(); i += 4) { auto v0 = vertices[i + 0]; auto v1 = vertices[i + 1]; auto v2 = vertices[i + 2]; auto v3 = vertices[i + 3]; quadNormals.push_back(normalize(_mm_add_ps(normal(v0, v1, v2), normal(v0, v2, v3)))); } std::vector<__m128> centroids; std::vector<uint32_t> centroidAssignment; centroids.push_back(_mm_setr_ps(+1.0f, 0.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, +1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, +1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, -1.0f, 0.0f, 0.0f)); centroids.push_back(_mm_setr_ps(0.0f, 0.0f, -1.0f, 0.0f)); centroids.push_back(_mm_setr_ps(-1.0f, 0.0f, 0.0f, 0.0f)); centroidAssignment.resize(vertices.size() / 4); bool anyChanged = true; for (int iter = 0; iter < 10 && anyChanged; ++iter) { anyChanged = false; for (auto j = 0; j < quadNormals.size(); ++j) { __m128 normal = quadNormals[j]; __m128 bestDistance = _mm_set1_ps(-std::numeric_limits<float>::infinity()); int bestCentroid = -1; for (int k = 0; k < centroids.size(); ++k) { __m128 distance = _mm_dp_ps(centroids[k], normal, 0x7F); if (_mm_comige_ss(distance, bestDistance)) { bestDistance = distance; bestCentroid = k; } } if (centroidAssignment[j] != bestCentroid) { centroidAssignment[j] = bestCentroid; anyChanged = true; } } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = _mm_setzero_ps(); } for (int j = 0; j < quadNormals.size(); ++j) { int k = centroidAssignment[j]; centroids[k] = _mm_add_ps(centroids[k], quadNormals[j]); } for (int k = 0; k < centroids.size(); ++k) { centroids[k] = normalize(centroids[k]); } } std::vector<__m128> orderedVertices; for (int k = 0; k < centroids.size(); ++k) { for (int j = 0; j < vertices.size() / 4; ++j) { 
if (centroidAssignment[j] == k) { orderedVertices.push_back(vertices[4 * j + 0]); orderedVertices.push_back(vertices[4 * j + 1]); orderedVertices.push_back(vertices[4 * j + 2]); orderedVertices.push_back(vertices[4 * j + 3]); } } } auto occluder = std::make_unique<Occluder>(); __m128 invExtents = _mm_div_ps(_mm_set1_ps(1.0f), _mm_sub_ps(refMax, refMin)); __m128 scalingX = _mm_set1_ps(2047.0f); __m128 scalingY = _mm_set1_ps(2047.0f); __m128 scalingZ = _mm_set1_ps(1023.0f); __m128 half = _mm_set1_ps(0.5f); for (size_t i = 0; i < orderedVertices.size(); i += 16) { for (auto j = 0; j < 4; ++j) { // Transform into [0,1] space relative to bounding box __m128 v0 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 0], refMin), invExtents); __m128 v1 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 4], refMin), invExtents); __m128 v2 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 8], refMin), invExtents); __m128 v3 = _mm_mul_ps(_mm_sub_ps(orderedVertices[i + j + 12], refMin), invExtents); // Transpose into [xxxx][yyyy][zzzz][wwww] _MM_TRANSPOSE4_PS(v0, v1, v2, v3); // Scale and truncate to int v0 = _mm_fmadd_ps(v0, scalingX, half); v1 = _mm_fmadd_ps(v1, scalingY, half); v2 = _mm_fmadd_ps(v2, scalingZ, half); __m128i X = _mm_cvttps_epi32(v0); __m128i Y = _mm_cvttps_epi32(v1); __m128i Z = _mm_cvttps_epi32(v2); // Pack to 11/11/10 format __m128i XYZ = _mm_or_si128(_mm_slli_epi32(X, 21), _mm_or_si128(_mm_slli_epi32(Y, 10), Z)); occluder->m_vertexData.push_back(XYZ); } } occluder->m_refMin = refMin; occluder->m_refMax = refMax; __m128 min = _mm_set1_ps(+std::numeric_limits<float>::infinity()); __m128 max = _mm_set1_ps(-std::numeric_limits<float>::infinity()); for (size_t i = 0; i < orderedVertices.size(); ++i) { min = _mm_min_ps(vertices[i], min); max = _mm_max_ps(vertices[i], max); } // Set W = 1 - this is expected by frustum culling code min = _mm_blend_ps(min, _mm_set1_ps(1.0f), 0b1000); max = _mm_blend_ps(max, _mm_set1_ps(1.0f), 0b1000); occluder->m_boundsMin = min; 
occluder->m_boundsMax = max; occluder->m_center = _mm_mul_ps(_mm_add_ps(max, min), _mm_set1_ps(0.5f)); return occluder; }
/* Combined sine/cosine of four packed floats.
   Since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them: it is almost as fast, and gives you a free cosine with your sine.

   xptr: input vector; sptr: receives the sines; cptr: receives the cosines.
   All three are reinterpreted as __m128 pointers.
   With USE_SSE2 the integer work is done in SSE2 registers, otherwise in MMX
   registers (two __m64 halves per vector). */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
  __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  __m128i emm0, emm2, emm4;
#else
  __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);

  /* keep a copy of j for the cosine sign computed further down */
  emm4 = emm2;

  /* get the swap sign flag for the sine: (j & 4) shifted up by 29 bits */
  emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

  /* get the polynom selection mask for the sine: all ones where (j & 2) == 0 */
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 (low two lanes : high two lanes) */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  /* keep a copy of j for the cosine sign computed further down */
  mm4 = mm2;
  mm5 = mm3;

  /* get the swap sign flag for the sine: (j & 4) shifted up by 29 bits */
  mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  __m128 swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine: all ones where (j & 2) == 0 */
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  __m128 poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: range reduction in three steps
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the DP constants are stored negated, hence the additions below) */
  xmm1 = *(__m128*)_ps_minus_cephes_DP1;
  xmm2 = *(__m128*)_ps_minus_cephes_DP2;
  xmm3 = *(__m128*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  /* get the sign flag for the cosine: ~(j - 2) & 4, shifted up by 29 bits */
  emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine: ~(j - 2) & 4, shifted up by 29 bits */
  mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  __m128 sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx: no MMX intrinsics are used past this point */
#endif

  /* fold the quadrant-dependent flip into the sine sign */
  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom, built from the coscof coefficients (0 <= x <= Pi/4) */
  __m128 z = _mm_mul_ps(x,x);
  y = *(__m128*)_ps_coscof_p0;

  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)_ps_1);

  /* Evaluate the second polynom, built from the sincof coefficients */
  __m128 y2 = *(__m128*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms:
     per lane, the sine takes one polynomial and the cosine takes the other */
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
/*
 * Separable 5-tap convolution of one 8-bit plane: a vertical pass followed by a
 * horizontal pass, 16 pixels per inner iteration.
 *
 *   ch      - coefficients (m_v / m_h), reciprocal divisors (rdiv_v / rdiv_h),
 *             bias and the saturate flag
 *   buff    - scratch memory holding five line buffers, bstride bytes apart
 *   width, height, stride - plane geometry; stride is used for both srcp and dstp
 *   dstp    - destination plane, written with aligned 16-byte stores
 *   srcp    - source plane
 *
 * NOTE(review): line_copy8 is defined elsewhere; presumably it copies one source
 * line into a line buffer and fills a 2-pixel border on each side -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    /* five line buffers; p2 is the line currently being filtered */
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    /* prime the window for row 0: the two lines "above" are source rows 2 and 1 */
    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);   /* every bit set */
    __m128i one = _mm_srli_epi16(all1, 15);       /* 1 in each 16-bit lane */
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    for (int i = 0; i < 5; i++) {
        /* each coefficient sits in the low 16 bits of a 32-bit lane, with zero above it */
        matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero);
        matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* fetch the line two rows below; for the last two rows step backwards instead */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 16) {
            /* entries 0-4: vertical taps; entries 5-9: horizontal taps, whose
               centre tap reads the vertical result just written to dstp */
            uint8_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            /* j == 0: vertical pass, j == 1: horizontal pass */
            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                __m128i sum[4];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();
                sum[2] = _mm_setzero_si128();
                sum[3] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    /* widen 16 bytes to four vectors of 32-bit values and
                       accumulate value * coefficient into the four sums */
                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                    xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                    sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));
                    xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                    sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
                }

                for (int i = 0; i < 4; i++) {
                    /* scale in float; the bias is added after the horizontal pass only */
                    __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    /* back to integers, truncating */
                    sum[i] = _mm_cvttps_epi32(sumfp);
                }

                /* 32 -> 16 bit with signed saturation */
                sum[0] = _mm_packs_epi32(sum[0], sum[1]);
                sum[1] = _mm_packs_epi32(sum[2], sum[3]);

                if (!ch->saturate) {
                    /* replace negative values by their two's-complement negation */
                    for (int i = 0; i < 2; i++) {
                        __m128i mask = _mm_cmplt_epi16(sum[i], zero);
                        __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1));
                        temp = _mm_and_si128(temp, mask);
                        sum[i] = _mm_andnot_si128(mask, sum[i]);
                        sum[i] = _mm_or_si128(sum[i], temp);
                    }
                }

                /* 16 -> 8 bit with unsigned saturation, then store 16 pixels */
                sum[0] = _mm_packus_epi16(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        /* rotate the line buffers: the oldest one is reused for the next fetch */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
__m128 exp_ps(v4sfu *xPtr) { __m128 x=*((__m128 *)xPtr); __m128 tmp = _mm_setzero_ps(), fx; #ifdef USE_SSE2 __m128i emm0; #else __m64 mm0, mm1; #endif __m128 one = *(__m128*)_ps_1; x = _mm_min_ps(x, *(__m128*)_ps_exp_hi); x = _mm_max_ps(x, *(__m128*)_ps_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF); fx = _mm_add_ps(fx, *(__m128*)_ps_0p5); /* how to perform a floorf with SSE: just below */ #ifndef USE_SSE2 /* step 1 : cast to int */ tmp = _mm_movehl_ps(tmp, fx); mm0 = _mm_cvttps_pi32(fx); mm1 = _mm_cvttps_pi32(tmp); /* step 2 : cast back to float */ tmp = _mm_cvtpi32x2_ps(mm0, mm1); #else emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); #endif /* if greater, substract 1 */ __m128 mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1); __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); __m128 y = *(__m128*)_ps_cephes_exp_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); /* build 2^n */ #ifndef USE_SSE2 z = _mm_movehl_ps(z, fx); mm0 = _mm_cvttps_pi32(fx); mm1 = _mm_cvttps_pi32(z); mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f); mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f); mm0 = _mm_slli_pi32(mm0, 23); mm1 = _mm_slli_pi32(mm1, 23); __m128 pow2n; COPY_MM_TO_XMM(mm0, mm1, pow2n); _mm_empty(); #else emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f); emm0 = _mm_slli_epi32(emm0, 23); __m128 pow2n = _mm_castsi128_ps(emm0); #endif y = _mm_mul_ps(y, pow2n); return y; }
/* Cosine of the four packed floats pointed to by xPtr (any x).
   Almost the same as sin_ps: take |x|, work out j from |x| * 4/Pi, reduce the
   argument, evaluate two polynomials and pick one per lane, then apply the sign.
   With USE_SSE2 the integer work is done in SSE2 registers, otherwise in MMX. */
__m128 cos_ps(v4sfu *xPtr) {
  __m128 arg = *((__m128 *)xPtr);

  /* take the absolute value */
  arg = _mm_and_ps(arg, *(__m128*)_ps_inv_sign_mask);

  /* scale by 4/Pi */
  __m128 j_ps = _mm_mul_ps(arg, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* integer part, then j=(j+1) & (~1) (see the cephes sources) */
  __m128i j = _mm_cvttps_epi32(j_ps);
  j = _mm_add_epi32(j, *(__m128i*)_pi32_1);
  j = _mm_and_si128(j, *(__m128i*)_pi32_inv1);
  j_ps = _mm_cvtepi32_ps(j);

  j = _mm_sub_epi32(j, *(__m128i*)_pi32_2);

  /* sign flag: ~j & 4, moved up by 29 bits */
  const __m128i sign_lanes = _mm_slli_epi32(_mm_andnot_si128(j, *(__m128i*)_pi32_4), 29);
  /* polynomial selection mask: all ones where (j & 2) == 0 */
  const __m128i select_lanes =
      _mm_cmpeq_epi32(_mm_and_si128(j, *(__m128i*)_pi32_2), _mm_setzero_si128());

  const __m128 sign_bit = _mm_castsi128_ps(sign_lanes);
  const __m128 poly_mask = _mm_castsi128_ps(select_lanes);
#else
  __m64 sign_lo, sign_hi, j_lo, j_hi;

  /* integer part of the low and high lane pairs */
  const __m128 upper_pair = _mm_movehl_ps(_mm_setzero_ps(), j_ps);
  j_lo = _mm_cvttps_pi32(j_ps);
  j_hi = _mm_cvttps_pi32(upper_pair);

  /* j=(j+1) & (~1) (see the cephes sources) */
  j_lo = _mm_add_pi32(j_lo, *(__m64*)_pi32_1);
  j_hi = _mm_add_pi32(j_hi, *(__m64*)_pi32_1);
  j_lo = _mm_and_si64(j_lo, *(__m64*)_pi32_inv1);
  j_hi = _mm_and_si64(j_hi, *(__m64*)_pi32_inv1);

  j_ps = _mm_cvtpi32x2_ps(j_lo, j_hi);

  j_lo = _mm_sub_pi32(j_lo, *(__m64*)_pi32_2);
  j_hi = _mm_sub_pi32(j_hi, *(__m64*)_pi32_2);

  /* sign flag: ~j & 4, moved up by 29 bits */
  sign_lo = _mm_andnot_si64(j_lo, *(__m64*)_pi32_4);
  sign_hi = _mm_andnot_si64(j_hi, *(__m64*)_pi32_4);
  sign_lo = _mm_slli_pi32(sign_lo, 29);
  sign_hi = _mm_slli_pi32(sign_hi, 29);

  /* polynomial selection mask: all ones where (j & 2) == 0 */
  j_lo = _mm_and_si64(j_lo, *(__m64*)_pi32_2);
  j_hi = _mm_and_si64(j_hi, *(__m64*)_pi32_2);
  j_lo = _mm_cmpeq_pi32(j_lo, _mm_setzero_si64());
  j_hi = _mm_cmpeq_pi32(j_hi, _mm_setzero_si64());

  __m128 sign_bit, poly_mask;
  COPY_MM_TO_XMM(sign_lo, sign_hi, sign_bit);
  COPY_MM_TO_XMM(j_lo, j_hi, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif

  /* range reduction in three steps:
     x = ((x - y * DP1) - y * DP2) - y * DP3; (the constants are stored negated) */
  arg = _mm_add_ps(arg, _mm_mul_ps(j_ps, *(__m128*)_ps_minus_cephes_DP1));
  arg = _mm_add_ps(arg, _mm_mul_ps(j_ps, *(__m128*)_ps_minus_cephes_DP2));
  arg = _mm_add_ps(arg, _mm_mul_ps(j_ps, *(__m128*)_ps_minus_cephes_DP3));

  const __m128 arg_sq = _mm_mul_ps(arg, arg);

  /* first polynomial (coscof coefficients) */
  __m128 poly_a = *(__m128*)_ps_coscof_p0;
  poly_a = _mm_add_ps(_mm_mul_ps(poly_a, arg_sq), *(__m128*)_ps_coscof_p1);
  poly_a = _mm_add_ps(_mm_mul_ps(poly_a, arg_sq), *(__m128*)_ps_coscof_p2);
  poly_a = _mm_mul_ps(_mm_mul_ps(poly_a, arg_sq), arg_sq);
  poly_a = _mm_sub_ps(poly_a, _mm_mul_ps(arg_sq, *(__m128*)_ps_0p5));
  poly_a = _mm_add_ps(poly_a, *(__m128*)_ps_1);

  /* second polynomial (sincof coefficients) */
  __m128 poly_b = *(__m128*)_ps_sincof_p0;
  poly_b = _mm_add_ps(_mm_mul_ps(poly_b, arg_sq), *(__m128*)_ps_sincof_p1);
  poly_b = _mm_add_ps(_mm_mul_ps(poly_b, arg_sq), *(__m128*)_ps_sincof_p2);
  poly_b = _mm_mul_ps(poly_b, arg_sq);
  poly_b = _mm_mul_ps(poly_b, arg);
  poly_b = _mm_add_ps(poly_b, arg);

  /* select the correct result from the two polynomials */
  const __m128 from_b = _mm_and_ps(poly_mask, poly_b);
  const __m128 from_a = _mm_andnot_ps(poly_mask, poly_a);
  const __m128 magnitude = _mm_add_ps(from_a, from_b);

  /* update the sign */
  return _mm_xor_ps(magnitude, sign_bit);
}
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *
 *            Valid for all IEEE754 floats $x_z$.
 *
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide;
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 *
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *
 *            On maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]: we need -127<=k<=128, so
 *            that k+127 is 0..255, a valid IEEE754 8-bit exponent,
 *            and the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k. If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf. If k<-127, k+127 is
 *            negative and the pattern is garbage. If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.
 *            So for x' (base 2) < -127.5 we must definitely return
 *            e^x ~ 0; for x' < -126.5 we're going to calculate 0
 *            anyway (because k=floor(-126.5-epsilon+0.5) = -127).
 *            So any minlogf between -126.5 log2 ... -127.5 log2
 *            will suffice as the cutoff. Ditto for 126.5 log2 ..
 *            127.5 log2. That's 87.68312 .. 88.3762655. Pommier's
 *            thinking seems to be that you don't want to get too
 *            close to the edges, lest fp roundoff error bite you
 *            (he may have considered 1 ulp carefully, I can't
 *            tell), but otherwise you may as well put the bounds
 *            close to the outer edge; so
 *                maxlogf =  127.5 log(2) - epsilon
 *                minlogf = -127.5 log(2) + epsilon
 *            for an epsilon that happens to be ~ 3e-6.
 */
__m128
esl_sse_expf(__m128 x)
{
  static const float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
                                     4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
  static const float cephes_c[2] = { 0.693359375f, -2.12194440e-4f };
  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks the 2^k "float"
   * construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
  static const float maxlogf =  88.3762626647949f;
  /* -127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
  static const float minlogf = -88.3762626647949f;
  int i;

  /* remember which lanes are out of range; they are patched at the end */
  const __m128 too_big   = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
  const __m128 too_small = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

  /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
  __m128 kf = _mm_add_ps(_mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R)), _mm_set1_ps(0.5f));

  /* floorf() with SSE: truncate to int, convert back, and where that
   * increased the value (i.e. it was negative) subtract one, branch-free */
  __m128i k = _mm_cvttps_epi32(kf);
  const __m128 truncated = _mm_cvtepi32_ps(k);
  const __m128 correction = _mm_and_ps(_mm_cmpgt_ps(truncated, kf), _mm_set1_ps(1.0f));
  kf = _mm_sub_ps(truncated, correction);
  k  = _mm_cvttps_epi32(kf);	/* k is now ready for the 2^k part. */

  /* polynomial approx for e^f for f in range [-0.5, 0.5] */
  const __m128 k_c0 = _mm_mul_ps(kf, _mm_set1_ps(cephes_c[0]));
  const __m128 k_c1 = _mm_mul_ps(kf, _mm_set1_ps(cephes_c[1]));
  x = _mm_sub_ps(x, k_c0);
  x = _mm_sub_ps(x, k_c1);
  const __m128 x2 = _mm_mul_ps(x, x);

  __m128 y = _mm_set1_ps(cephes_p[0]);
  for (i = 1; i < 6; i++)
    y = _mm_add_ps(_mm_mul_ps(y, x), _mm_set1_ps(cephes_p[i]));
  y = _mm_add_ps(_mm_mul_ps(y, x2), x);
  y = _mm_add_ps(y, _mm_set1_ps(1.0f));

  /* build 2^k by hand, by creating a IEEE754 float */
  k = _mm_slli_epi32(_mm_add_epi32(k, _mm_set1_epi32(127)), 23);
  const __m128 pow2k = _mm_castsi128_ps(k);

  /* put 2^k e^f together and we're done */
  y = _mm_mul_ps(y, pow2k);

  /* special/range cleanup */
  y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), too_big);   /* exp(x) = inf for x > log(2^128)  */
  y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        too_small); /* exp(x) = 0   for x < log(2^-149) */
  return y;
}
/* Converts the four packed floats in p to packed 32-bit integers with
   truncation (_mm_cvttps_epi32) and returns the result.
   NOTE(review): the return type is not visible on this line -- presumably it
   is declared on a preceding line; confirm. */
test (__m128 p) { return _mm_cvttps_epi32 (p); }
{ template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> { typedef typename meta::as_integer<A0>::type type; }; NT2_FUNCTOR_CALL_DISPATCH( 1, typename nt2::meta::scalar_of<A0>::type, (3, (float,double,arithmetic_)) ) NT2_FUNCTOR_CALL_EVAL_IF(1, float) { typedef typename NT2_CALL_RETURN_TYPE(1)::type type; type that = {_mm_cvttps_epi32(a0)}; return that; } NT2_FUNCTOR_CALL_EVAL_IF(1, double) { typedef typename NT2_CALL_RETURN_TYPE(1)::type type; const type v = {{a0[0],a0[1]}}; return v; } NT2_FUNCTOR_CALL_EVAL_IF(1, arithmetic_) { return a0; } }; } } #endif
static __m128i cielabv (union hvrgbpix rgb) { __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5); __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0); __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0); __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0); __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]); __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]); __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]); __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]); __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]); __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v)); xvxyz[0] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO))); xvxyz[1] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO))); __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]); __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]); #ifdef __AVX__ __m256 vlab, vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], 0}, vxyz2 = {0, cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]}; vlab = _mm256_sub_ps(vxyz,vxyz2); vlab = _mm256_mul_ps(vlab, 
_mm256_setr_ps(116,500,200,0,116,500,200,0)); vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0)); vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64)); vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO); __m256i vlabi = _mm256_cvtps_epi32(vlab); return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]); #else __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], 0}; __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], 0}; vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3))); vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0)); vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0)); vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64)); vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO); vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3))); vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0)); vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0)); vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64)); vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO); return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh)); #endif }
/*
 * 5x5 convolution of an 8-bit plane, 16 output pixels per inner iteration.
 *
 * ch      : supplies the 25 coefficients (m), the scale (rdiv), the offset
 *           (bias) and the saturate flag.
 * buff    : scratch memory holding five line buffers, bstride bytes apart,
 *           the first one starting at buff + 16.
 * width   : pixels per row; height: number of rows.
 * stride  : distance in bytes between rows of both srcp and dstp.
 * dstp    : destination rows; written with aligned 16-byte stores.
 * srcp    : source rows.
 *
 * Each output is (sum(pixel * m) * rdiv + bias), replaced by its absolute
 * value when ch->saturate is 0, truncated to integer and narrowed to 8 bits
 * by the saturating packs.
 * NOTE(review): line_copy8(dst, src, width, 2) presumably copies one row
 * into a line buffer with 2 pixels of padding on each side -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    /* five line buffers used as a ring; p2 is the row being produced */
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *const ring_first = p0;
    uint8_t *const ring_last = p4;

    /* prime the ring: rows 2 and 1 stand in for the two rows above row 0 */
    line_copy8(p0, srcp + 2 * stride , width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    const __m128i zero = _mm_setzero_si128();
    const __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    const __m128 bias = _mm_set1_ps((float)ch->bias);

    /* each coefficient sits in the low 16 bits of every 32-bit lane so that
       _mm_madd_epi16 against zero-extended pixels yields pixel * coefficient */
    __m128i coef[25];
    for (int tap = 0; tap < 25; tap++) {
        coef[tap] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[tap]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* step forward, or step back once fewer than two rows remain below */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        uint8_t *rows[5] = { p0, p1, p2, p3, p4 };

        for (int x = 0; x < width; x += 16) {
            __m128i acc[4] = { zero, zero, zero, zero };

            for (int r = 0; r < 5; r++) {
                for (int c = 0; c < 5; c++) {
                    const __m128i k = coef[r * 5 + c];
                    const __m128i px = _mm_loadu_si128((__m128i *)(rows[r] + (c - 2) + x));

                    /* widen 16 bytes to four vectors of four 32-bit lanes */
                    const __m128i lo16 = _mm_unpacklo_epi8(px, zero);
                    const __m128i hi16 = _mm_unpackhi_epi8(px, zero);

                    acc[0] = _mm_add_epi32(acc[0], _mm_madd_epi16(_mm_unpacklo_epi16(lo16, zero), k));
                    acc[1] = _mm_add_epi32(acc[1], _mm_madd_epi16(_mm_unpackhi_epi16(lo16, zero), k));
                    acc[2] = _mm_add_epi32(acc[2], _mm_madd_epi16(_mm_unpacklo_epi16(hi16, zero), k));
                    acc[3] = _mm_add_epi32(acc[3], _mm_madd_epi16(_mm_unpackhi_epi16(hi16, zero), k));
                }
            }

            for (int q = 0; q < 4; q++) {
                __m128 scaled = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(acc[q]), rdiv), bias);
                if (!ch->saturate) {
                    scaled = mm_abs_ps(scaled);
                }
                acc[q] = _mm_cvttps_epi32(scaled);
            }

            /* 32 -> 16 (signed saturation) -> 8 (unsigned saturation) */
            const __m128i lo = _mm_packs_epi32(acc[0], acc[1]);
            const __m128i hi = _mm_packs_epi32(acc[2], acc[3]);
            _mm_store_si128((__m128i *)(dstp + x), _mm_packus_epi16(lo, hi));
        }

        dstp += stride;

        /* rotate the ring; the oldest buffer is reused for the next row */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == ring_last) ? ring_first : p4 + bstride;
    }
}
/*
 * Two-pass 5-tap convolution (vertical, then horizontal) of a 16-bit plane,
 * 8 output pixels per inner iteration.
 *
 * ch      : supplies the coefficients m_v / m_h, the scales rdiv_v / rdiv_h,
 *           the offset bias (added in the horizontal pass only) and the
 *           saturate flag.
 * buff    : scratch memory holding five line buffers.
 * bstride : distance between line buffers, halved here before being used
 *           as a uint16_t element offset.
 * stride  : distance between rows of s and d, halved the same way.
 * d, s    : destination / source planes, accessed as uint16_t.
 *
 * For each group of pixels the vertical pass (j == 0) writes its result to
 * dstp + x; the horizontal pass (j == 1) then reads that result as its centre
 * tap, together with the four horizontal neighbours from the p2 line buffer,
 * and overwrites dstp + x.
 *
 * After scaling, each 32-bit result is mapped into [0, 0xFFFF]:
 *   - negatives become 0 when ch->saturate is set, otherwise their absolute
 *     value is taken;
 *   - the result is then clamped to 0xFFFF.
 * The clamp has to come after the absolute value: a sum below -65535 would
 * otherwise skip it and reach the 16-bit narrowing with a magnitude that
 * does not fit.
 *
 * NOTE(review): line_copy16() and mm_cast_epi32() are defined elsewhere;
 * mm_cast_epi32 presumably narrows two vectors of 32-bit lanes to one vector
 * of 16-bit lanes -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    /* five line buffers used as a ring; p2 is the row being produced */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* prime the ring: rows 2 and 1 stand in for the two rows above row 0 */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128i max16 = _mm_srli_epi32(all1, 16);   /* 0x0000FFFF in every lane */
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    /* coefficients are stored as magnitudes plus a separate sign flag,
       because the pixels are multiplied as unsigned 16-bit values */
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        /* step forward, or step back once fewer than two rows remain below */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            /* taps 0..4: vertical pass; taps 5..9: horizontal pass, whose
               centre tap is the vertical result just stored at dstp + x */
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);

                    /* full 32-bit unsigned product from low and high halves */
                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        /* two's-complement negate: ~v + 1 */
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;

                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    /* first deal with negatives: clip to 0 or fold to |v| */
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }

                    /* then clamp to 0xFFFF so the value fits in 16 bits */
                    mask = _mm_cmplt_epi32(sum[i], max16);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, max16));
                }

                sum[0] = mm_cast_epi32(sum[0], sum[1]);
                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }

        dstp += stride;

        /* rotate the ring; the oldest buffer is reused for the next row */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
// Per 32-bit lane: returns (KF_255_DIV_6 * dividend) / divisor, evaluated in
// single precision and converted back to integer with truncation.
// The scale factor is whatever the caller passes in KF_255_DIV_6.
SIMD_INLINE __m128i MulDiv32(__m128i dividend, __m128i divisor, const __m128 & KF_255_DIV_6)
{
    const __m128 scaledDividend = _mm_mul_ps(KF_255_DIV_6, _mm_cvtepi32_ps(dividend));
    const __m128 divisorPs = _mm_cvtepi32_ps(divisor);
    const __m128 quotient = _mm_div_ps(scaledDividend, divisorPs);
    return _mm_cvttps_epi32(quotient);
}
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i& sa, const __m128i& da) { __m128i tmp1, tmp2, tmp3; // int m = da ? dc * 256 / da : 0; __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); __m128i m = _mm_slli_epi32(dc, 8); __m128 x = _mm_cvtepi32_ps(m); __m128 y = _mm_cvtepi32_ps(da); m = _mm_cvttps_epi32(_mm_div_ps(x, y)); m = _mm_andnot_si128(cmp, m); // if (2 * sc <= sa) tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); tmp1 = _mm_sub_epi32(tmp1, sa); // 2 * sc - sa tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m tmp1 = Multiply32_SSE2(tmp1, tmp2); tmp1 = _mm_srai_epi32(tmp1, 8); tmp1 = _mm_add_epi32(sa, tmp1); tmp1 = Multiply32_SSE2(dc, tmp1); __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); // else if (4 * dc <= da) tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); __m128i i = _mm_slli_epi32(m, 2); // 4 * m __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) i = _mm_srai_epi32(i, 16); // >> 16 j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m tmp2 = _mm_add_epi32(i, j); i = Multiply32_SSE2(dc, sa); // dc * sa j = _mm_slli_epi32(sc, 1); // 2 * sc j = _mm_sub_epi32(j, sa); // 2 * sc - sa j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) tmp2 = Multiply32_SSE2(j, tmp2); // * tmp tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 tmp2 = _mm_add_epi32(i, tmp2); cmp = _mm_andnot_si128(cmp2, cmp1); __m128i rc2 = _mm_and_si128(cmp, tmp2); __m128i rc = _mm_or_si128(rc1, rc2); // else tmp3 = sqrt_unit_byte_SSE2(m); tmp3 = _mm_sub_epi32(tmp3, m); tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) tmp3 = _mm_srai_epi32(tmp3, 8); tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa cmp = _mm_and_si128(cmp1, cmp2); __m128i rc3 = _mm_and_si128(cmp, tmp3); rc = 
_mm_or_si128(rc, rc3); tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da tmp1 = _mm_mullo_epi16(sc, tmp1); tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa tmp2 = _mm_mullo_epi16(dc, tmp2); rc = _mm_add_epi32(rc, tmp1); rc = _mm_add_epi32(rc, tmp2); return clamp_div255round_SSE2(rc); }
/* Evaluation of 4 sines at once, using only SSE2.

   The code is an exact rewrite of the cephes sinf function.
   Precision is excellent as long as x < 8192 (the special handling cephes
   has for greater values is not implemented -- the function does not return
   garbage for arguments over 8192, but the extra precision is missing).

   Note that sinf((float)M_PI) = 8.74e-8, which is the surprising but
   correct result.

   Performance is also surprisingly good: 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning)!
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments.

   On a core 1 duo, the execution of this function takes approximately 95
   cycles.

   From experiments with the Intel AMath lib, switching to an SSE2 version
   would improve the performance by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.

   Steps: split off the sign, scale |x| by 4/Pi to find the octant j, reduce
   |x| by j * Pi/4 using the three-part constant DP1+DP2+DP3, evaluate both
   the cosine and the sine polynomial on the reduced argument, select one per
   lane from the octant, and finally apply the sign.
*/
__m128 sin_ps(__m128 x) { // any x
    typedef __m128 v4sf;
    typedef __m128i v4si;
    v4sf xmm1, xmm2, xmm3, sign_bit, y;
    v4si emm0, emm2;

    /* extract the sign bit (upper one) */
    sign_bit = _mm_and_ps(x, constants::sign_mask.ps);
    /* take the absolute value by clearing that same sign bit, so the mask
       used here can never disagree with the one used for the sign */
    x = _mm_andnot_ps(constants::sign_mask.ps, x);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);

    /* get the swap sign flag: bit 2 of j moved up to the float sign bit */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);

    /* get the polynom selection mask:
       there is one polynom for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2.
       Both branches will be computed. */
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    /* The magic pass: extended precision modular arithmetic
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the cosine polynom (uses the coscof coefficients) */
    y = constants::coscof_p0.ps;
    v4sf z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the sine polynom (uses the sincof coefficients) */
    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynoms: the sine polynom
       where poly_mask is set, the cosine polynom elsewhere */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);

    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
}