void sub_ssememcpy(void* _Dst, const void* _Src, size_t size)
{
    assert(IS_16BYTE_ALIGNMENT(_Dst));
    assert(IS_16BYTE_ALIGNMENT(_Src));

    float* dst = (float*)_Dst;
    float* src = (float*)_Src;

    size_t loop_num = size >> 6;
    for (size_t i = 0; i < loop_num; i++) {
        // load 64 bytes
        __m128 xmm0 = _mm_load_ps(src + 0);
        __m128 xmm1 = _mm_load_ps(src + 4);
        __m128 xmm2 = _mm_load_ps(src + 8);
        __m128 xmm3 = _mm_load_ps(src + 12);

        // store 64 bytes with non-temporal (streaming) stores that bypass the cache;
        // the regular cached alternative would be:
        //_mm_store_ps(dst + 0, xmm0);
        //_mm_store_ps(dst + 4, xmm1);
        //_mm_store_ps(dst + 8, xmm2);
        //_mm_store_ps(dst + 12, xmm3);
        _mm_stream_si128((__m128i*)(dst + 0),  _mm_castps_si128(xmm0));
        _mm_stream_si128((__m128i*)(dst + 4),  _mm_castps_si128(xmm1));
        _mm_stream_si128((__m128i*)(dst + 8),  _mm_castps_si128(xmm2));
        _mm_stream_si128((__m128i*)(dst + 12), _mm_castps_si128(xmm3));

        dst += 16;
        src += 16;
    }
    // the streaming stores are weakly ordered, so fence before anyone reads the buffer
    _mm_sfence();
    // copy the remaining (< 64 byte) tail
    memcpy(dst, src, size & 0x3F);
}
// =============================================================
// ====================== RGB2BGR_32F ==========================
// =============================================================
void _rgb2bgr_32f(const float* _src, float* _dest, unsigned int _width, unsigned int _pitchs, unsigned int _pitchd, unsigned int _start, unsigned int _stop)
{
#ifdef USE_SSE
    // Number of 4-pixel (three __m128, 48-byte) blocks per row assigned to the thread
    const unsigned int widthz = (_pitchs/3) >> 2;
    // Get start positions for buffers
    const float* tsrc;
    float* tdest;
    for( unsigned int y=_start; y<=_stop; ++y )
    {
        tsrc  = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<widthz; ++x )
        {
            // v0 = [r0 g0 b0 r1], v1 = [g1 b1 r2 g2], v2 = [b2 r3 g3 b3]
            const __m128 v0 = _mm_load_ps(tsrc); tsrc+=4;
            const __m128 v1 = _mm_load_ps(tsrc); tsrc+=4;
            const __m128 v2 = _mm_load_ps(tsrc); tsrc+=4;

            // Build the BGR permutation; intermediates gather the lanes that
            // have to cross register boundaries
            const __m128 t0 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,1,0,0)); // [r0 r0 b1 b1]
            const __m128 o0 = _mm_shuffle_ps(v0, t0, _MM_SHUFFLE(2,0,1,2)); // [b0 g0 r0 b1]
            const __m128 t1 = _mm_shuffle_ps(v1, v0, _MM_SHUFFLE(3,3,0,0)); // [g1 g1 r1 r1]
            const __m128 t2 = _mm_shuffle_ps(v2, v1, _MM_SHUFFLE(3,3,0,0)); // [b2 b2 g2 g2]
            const __m128 o1 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2,0,2,0)); // [g1 r1 b2 g2]
            const __m128 t3 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,3,2,2)); // [r2 r2 b3 b3]
            const __m128 o2 = _mm_shuffle_ps(t3, v2, _MM_SHUFFLE(1,2,2,0)); // [r2 b3 g3 r3]

            _mm_store_ps(tdest, o0); tdest+=4;
            _mm_store_ps(tdest, o1); tdest+=4;
            _mm_store_ps(tdest, o2); tdest+=4;
        }
    }
#else
    const float* tsrc;
    float* tdest;
    for( unsigned int y=_start; y<=_stop; ++y )
    {
        tsrc  = _src+(y*_pitchs);
        tdest = _dest+(y*_pitchd);
        for( unsigned int x=0; x<_width; ++x )
        {
            float t = tsrc[3*x];
            tdest[3*x]   = tsrc[3*x+2];
            tdest[3*x+1] = tsrc[3*x+1];
            tdest[3*x+2] = t;
        }
    }
#endif
}
static inline __m128 log2f4(__m128 x) { __m128i exp = _mm_load_si128((__m128i*)_exp_mask); __m128i mant = _mm_load_si128((__m128i*)_mantissa_mask); __m128 one = _mm_load_ps(_ones_ps); __m128i i = _mm_castps_si128(x); __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_load_si128((__m128i*)_one27))); __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one); __m128 p; /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */ #if LOG_POLY_DEGREE == 6 p = POLY5( m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5); #elif LOG_POLY_DEGREE == 5 p = POLY4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4); #elif LOG_POLY_DEGREE == 4 p = POLY3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3); #elif LOG_POLY_DEGREE == 3 p = POLY2(m, log_p2_0, log_p2_1, log_p2_2); #else #error #endif /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ p = _mm_mul_ps(p, _mm_sub_ps(m, one)); return _mm_add_ps(p, e); }
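For reference, a scalar sketch (not part of the original code) of the exponent/mantissa split that log2f4 performs with _mm_and_si128/_mm_srli_epi32: the input is decomposed into x = m * 2^e with m in [1, 2), and log2(x) = e + log2(m). The SIMD version replaces the log2f() call on the mantissa with the minimax polynomial; this sketch assumes a normal, positive input.

#include <math.h>
#include <stdint.h>
#include <string.h>

static float log2_via_split(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);               /* reinterpret float as raw bits */
    int e = (int)((bits >> 23) & 0xFF) - 127;     /* unbiased exponent             */
    bits = (bits & 0x007FFFFF) | 0x3F800000;      /* keep mantissa, force e = 0    */
    float m;
    memcpy(&m, &bits, sizeof m);                  /* m is now in [1, 2)            */
    return (float)e + log2f(m);                   /* log2(x) = e + log2(m)         */
}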
void mandel_sse2(unsigned char *image, const struct spec *s) { __m128 xmin = _mm_set_ps1(s->xlim[0]); __m128 ymin = _mm_set_ps1(s->ylim[0]); __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width); __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height); __m128 threshold = _mm_set_ps1(4); __m128 one = _mm_set_ps1(1); __m128i zero = _mm_setzero_si128(); __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations); __m128 depth_scale = _mm_set_ps1(s->depth - 1); #pragma omp parallel for schedule(dynamic, 1) for (int y = 0; y < s->height; y++) { for (int x = 0; x < s->width; x += 4) { __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0); __m128 my = _mm_set_ps1(y); __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin); __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin); __m128 zr = cr; __m128 zi = ci; int k = 1; __m128 mk = _mm_set_ps1(k); while (++k < s->iterations) { /* Compute z1 from z0 */ __m128 zr2 = _mm_mul_ps(zr, zr); __m128 zi2 = _mm_mul_ps(zi, zi); __m128 zrzi = _mm_mul_ps(zr, zi); /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */ /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */ zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr); zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci); /* Increment k */ zr2 = _mm_mul_ps(zr, zr); zi2 = _mm_mul_ps(zi, zi); __m128 mag2 = _mm_add_ps(zr2, zi2); __m128 mask = _mm_cmplt_ps(mag2, threshold); mk = _mm_add_ps(_mm_and_ps(mask, one), mk); /* Early bailout? */ __m128i maski = _mm_castps_si128(mask); if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero))) break; } mk = _mm_mul_ps(mk, iter_scale); mk = _mm_sqrt_ps(mk); mk = _mm_mul_ps(mk, depth_scale); __m128i pixels = _mm_cvtps_epi32(mk); unsigned char *dst = image + y * s->width * 3 + x * 3; unsigned char *src = (unsigned char *)&pixels; for (int i = 0; i < 4; i++) { dst[i * 3 + 0] = src[i * 4]; dst[i * 3 + 1] = src[i * 4]; dst[i * 3 + 2] = src[i * 4]; } } } }
Iu32vec4 mandel_4(F32vec4 &c_re4, F32vec4 &c_im4, int max_iterations) { F32vec4 z_re4 = c_re4; F32vec4 z_im4 = c_im4; F32vec4 four4(4.0f); F32vec4 two4(2.0f); Iu32vec4 count4(0,0,0,0); Iu32vec4 one4(1,1,1,1); int i; for (i = 0; i < max_iterations; ++i) { F32vec4 z_re24 = z_re4 * z_re4; F32vec4 z_im24 = z_im4 * z_im4; F32vec4 mf4 = cmplt(z_re24 + z_im24, four4); Iu32vec4 mi4 (_mm_castps_si128((__m128)mf4)); if (is_zero (mi4)) { break; } F32vec4 new_re4 = z_re24 - z_im24; F32vec4 new_im4 = two4 * z_re4 * z_im4; z_re4 = c_re4 + new_re4; z_im4 = c_im4 + new_im4; count4 = count4 + (mi4 & one4); } return count4; }
HW_FORCE_INLINE Vec<N> inverse(const Vec<N>& a) { __m128 x = _mm_rcp_ps(a.xmm); if (N != 4) { // Clear unused components to 0 x = _mm_castsi128_ps(_mm_slli_si128(_mm_srli_si128(_mm_castps_si128(x), (4-N)*4), (4-N)*4)); } return Vec<N>(x); }
//same thing, but processes 4 pixels simultaneously using SSE2 intrinsics //probably can be made faster, but I'm no expert at low level code :) void mandelbrotKernelSSE2(__m128 re, __m128 im, unsigned char *out_color) { const __m128 escape = _mm_set_ps(9.0f, 9.0f, 9.0f, 9.0f); __m128i iter_inc = _mm_set_epi32(1, 1, 1, 1); __m128 z_re = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); __m128 z_im = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); __m128 z_re2 = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); __m128 z_im2 = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); __m128i iter_mask = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff); __m128i iter = _mm_set_epi32(0, 0, 0, 0); int i = 0; int iter_mask_v[4]; for(i=0; i < 32; i++) { z_im = _mm_mul_ps(z_re, z_im); z_im = _mm_add_ps(z_im, z_im); z_im = _mm_add_ps(z_im, im); z_re = _mm_sub_ps(z_re2, z_im2); z_re = _mm_add_ps(z_re, re); z_re2 = _mm_mul_ps(z_re, z_re); z_im2 = _mm_mul_ps(z_im, z_im); __m128 iter_mask_new=_mm_cmplt_ps(_mm_add_ps(z_re2, z_im2), escape); iter_mask = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(iter_mask), iter_mask_new)); iter_inc = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(iter_inc), _mm_castsi128_ps(iter_mask))); iter = _mm_add_epi32(iter, iter_inc); //not sure if it really speeds up the code, we are doing conditional based on //SSE2 register, probably there's much better way to do it _mm_storeu_ps((float*)iter_mask_v, _mm_castsi128_ps(iter_mask)); if(!(iter_mask_v[0] || iter_mask_v[1] || iter_mask_v[2] || iter_mask_v[3])) { break; } } int iters[4]; _mm_storeu_ps((float*)iters, _mm_castsi128_ps(iter)); for(i=0;i<4;i++) { unsigned char col = (iters[3 - i] == 32) ? 255 : (iters[3 - i] * 8); *out_color ++= col; *out_color ++= col; *out_color ++= col; *out_color ++= 255; } }
BOOST_FORCEINLINE __m128i shuffle(__m128i const lower, __m128i const upper) { return _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(lower), _mm_castsi128_ps(upper) , _MM_SHUFFLE(upper_i1, upper_i0, lower_i1, lower_i0) ) ); }
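The fragment above presumably sits inside a template whose index parameters (lower_i0, lower_i1, upper_i0, upper_i1) are not shown. A minimal standalone sketch of the same trick, with illustrative fixed indices: _mm_shuffle_ps provides the two-source 32-bit shuffle that SSE2 lacks for integer vectors, so the operands are reinterpreted, shuffled, and reinterpreted back.

#include <emmintrin.h>

static __m128i shuffle_lo_hi(__m128i lower, __m128i upper)
{
    /* result = { lower[0], lower[1], upper[2], upper[3] } as 32-bit lanes */
    return _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(lower),
                       _mm_castsi128_ps(upper),
                       _MM_SHUFFLE(3, 2, 1, 0)));
}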
void Detect32f(const HidHaarCascade & hid, size_t offset, const __m128 & norm, __m128i & result) { typedef HidHaarCascade Hid; const float * leaves = hid.leaves.data(); const Hid::Node * node = hid.nodes.data(); const Hid::Stage * stages = hid.stages.data(); for (int i = 0, n = (int)hid.stages.size(); i < n; ++i) { const Hid::Stage & stage = stages[i]; if (stage.canSkip) continue; const Hid::Node * end = node + stage.ntrees; __m128 stageSum = _mm_setzero_ps(); if (stage.hasThree) { for (; node < end; ++node, leaves += 2) { const Hid::Feature & feature = hid.features[node->featureIdx]; __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); if (feature.rect[2].p0) sum = _mm_add_ps(sum, WeightedSum32f(feature.rect[2], offset)); StageSum32f(leaves, node->threshold, sum, norm, stageSum); } } else { for (; node < end; ++node, leaves += 2) { const Hid::Feature & feature = hid.features[node->featureIdx]; __m128 sum = _mm_add_ps(WeightedSum32f(feature.rect[0], offset), WeightedSum32f(feature.rect[1], offset)); StageSum32f(leaves, node->threshold, sum, norm, stageSum); } } result = _mm_andnot_si128(_mm_castps_si128(_mm_cmpgt_ps(_mm_set1_ps(stage.threshold), stageSum)), result); int resultCount = ResultCount(result); if (resultCount == 0) { return; } else if (resultCount == 1) { uint32_t SIMD_ALIGNED(16) _result[4]; float SIMD_ALIGNED(16) _norm[4]; _mm_store_si128((__m128i*)_result, result); _mm_store_ps(_norm, norm); for (int j = 0; j < 4; ++j) { if (_result[j]) { _result[j] = Base::Detect32f(hid, offset + j, i + 1, _norm[j]) > 0 ? 1 : 0; break; } } result = _mm_load_si128((__m128i*)_result); return; } }
}
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h> { const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f); const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f); const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f); const __m128 _ps_pi = _mm_set1_ps(pi); const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5); const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000)); const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); __m128 mm1, mm2, mm3; __m128 axm, aym; __m128 xm = _mm_set1_ps(x); __m128 ym = _mm_set1_ps(y); axm = _mm_and_ps(xm, _mask_sign_inv); aym = _mm_and_ps(ym, _mask_sign_inv); mm1 = _mm_min_ps(axm, aym); mm2 = _mm_max_ps(axm, aym); mm1 = _mm_div_ps(mm1, mm2); mm2 = _mm_mul_ps(mm1, mm1); mm3 = _mm_mul_ps(mm2, _ps_atan_p0); mm3 = _mm_add_ps(mm3, _ps_atan_p1); mm3 = _mm_mul_ps(mm3, mm2); mm3 = _mm_sub_ps(mm3, _ps_atan_p2); mm3 = _mm_mul_ps(mm3, mm2); mm3 = _mm_mul_ps(mm3, mm1); mm3 = _mm_add_ps(mm3, mm1); __m128 mask; /* |y| > |x| */ mask = _mm_cmpgt_ss(aym, axm); mm2 = _mm_and_ps(_ps_pi0p5, mask); mm1 = _mm_and_ps(_mask_sign_raw, mask); mm3 = _mm_xor_ps(mm3, mm1); mm3 = _mm_add_ps(mm3, mm2); /* x < 0 */ mask = _mm_and_ps(xm, _mask_sign_raw); mm3 = _mm_xor_ps(mm3, mask); mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30)); mm1 = _mm_and_ps(_ps_pi, mm1); mm3 = _mm_add_ps(mm3, mm1); /* y < 0 */ mm1 = _mm_and_ps(ym, _mask_sign_raw); mm3 = _mm_xor_ps(mm3, mm1); return _mm_cvtss_f32(mm3); }
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy, Buffer & buffer, size_t col) { __m128 bestDot = _mm_setzero_ps(); __m128i bestIndex = _mm_setzero_si128(); for(int i = 0; i < buffer.size; ++i) { __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i])); __m128 mask = _mm_cmpgt_ps(dot, bestDot); bestDot = _mm_max_ps(dot, bestDot); bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex); dot = _mm_sub_ps(_mm_setzero_ps(), dot); mask = _mm_cmpgt_ps(dot, bestDot); bestDot = _mm_max_ps(dot, bestDot); bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex); } Store<align>((__m128i*)(buffer.index + col), bestIndex); Sse::Store<align>(buffer.value + col, _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy)))); }
void dif_ssememcpy(void* _Dst, const void* _Src, size_t size) { assert(IS_16BYTE_ALIGNMENT(_Src)); float* dst = (float*)_Dst; float* src = (float*)_Src; __m128 xmm0, xmm1, xmm2, xmm3, xmm4; int loop_num = size >> 6; xmm0 = _mm_load_ps(src + 0); _mm_storeu_ps(dst + 0, xmm0); dst = (float*)((int)dst + _SHIFT); __m128i xmm0i = _mm_srli_si128(_mm_castps_si128(xmm0), _SHIFT); //xmm0 >> _SHIFT for (int i = 0; i < loop_num; i++) { xmm1 = _mm_load_ps(src + 4); xmm3 = _mm_load_ps(src + 8); xmm2 = xmm1; xmm4 = xmm3; __m128i xmm1i = _mm_slli_si128(_mm_castps_si128(xmm1), 16 - _SHIFT); //xmm1 << (16 - _SHIFT) __m128i xmm2i = _mm_srli_si128(_mm_castps_si128(xmm2), _SHIFT); //xmm2 >> _SHIFT __m128i xmm3i = _mm_slli_si128(_mm_castps_si128(xmm3), 16 - _SHIFT); //xmm3 << (16 - _SHIFT) __m128i xmm4i = _mm_srli_si128(_mm_castps_si128(xmm4), _SHIFT); //xmm4 >> _SHIFT xmm1i = _mm_or_si128(xmm1i, xmm0i); xmm3i = _mm_or_si128(xmm3i, xmm2i); _mm_store_ps(dst + 0, _mm_castsi128_ps(xmm1i)); _mm_store_ps(dst + 4, _mm_castsi128_ps(xmm3i)); xmm1 = _mm_load_ps(src + 12); xmm3 = _mm_load_ps(src + 16); xmm2 = xmm1; xmm0 = xmm3; xmm1i = _mm_slli_si128(_mm_castps_si128(xmm1), 16 - _SHIFT); //xmm1 << (16 - _SHIFT) xmm2i = _mm_srli_si128(_mm_castps_si128(xmm2), _SHIFT); //xmm2 >> _SHIFT xmm3i = _mm_slli_si128(_mm_castps_si128(xmm3), 16 - _SHIFT); //xmm3 << (16 - _SHIFT) xmm0i = _mm_srli_si128(_mm_castps_si128(xmm0), _SHIFT); //xmm0 >> _SHIFT xmm1i = _mm_or_si128(xmm1i, xmm4i); xmm3i = _mm_or_si128(xmm3i, xmm2i); _mm_store_ps(dst + 8, _mm_castsi128_ps(xmm1i)); _mm_store_ps(dst + 12, _mm_castsi128_ps(xmm3i)); dst += 16; src += 16; } memcpy((void*)((int)dst - _SHIFT), src, size & 0x3F); }
static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, int source_dx) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; uint8 u, v, y; int x = 0; while (width >= 2) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; x += source_dx; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); y = y_buf[x >> 16]; x += source_dx; xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { /* comp_pred and pred must be 16 byte aligned. */ assert(((intptr_t)comp_pred & 0xf) == 0); assert(((intptr_t)pred & 0xf) == 0); if (width > 8) { int x, y; for (y = 0; y < height; ++y) { for (x = 0; x < width; x += 16) { const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); const __m128i avg = _mm_avg_epu8(p, r); _mm_store_si128((__m128i *)(comp_pred + x), avg); } comp_pred += width; pred += width; ref += ref_stride; } } else { // width must be 4 or 8. int i; // Process 16 elements at a time. comp_pred and pred have width == stride // and therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are // all divisible by 16 so just ref needs to be massaged when loading. for (i = 0; i < width * height; i += 16) { const __m128i p = _mm_load_si128((const __m128i *)pred); __m128i r; __m128i avg; if (width == ref_stride) { r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), loadu_uint32(ref + 2 * ref_stride), loadu_uint32(ref + ref_stride), loadu_uint32(ref)); ref += 4 * ref_stride; } else { const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); assert(width == 8); r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); ref += 2 * ref_stride; } avg = _mm_avg_epu8(p, r); _mm_store_si128((__m128i *)comp_pred, avg); pred += 16; comp_pred += 16; } } }
static void GF_FUNC_ALIGN VS_CC float_to_dst_16bit(const float *srcp, uint8_t *d, int width, int height, int src_stride, int dst_stride, float th, int bits) { uint16_t *dstp = (uint16_t *)d; dst_stride /= 2; __m128 tmax = _mm_set1_ps(th); int rshift = 32 - bits; for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 8) { __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax); __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax); __m128i xmi0 = _mm_srli_epi32(_mm_castps_si128(xmf0), rshift); __m128i xmi1 = _mm_srli_epi32(_mm_castps_si128(xmf1), rshift); xmi0 = mm_cast_epi32(xmi0, xmi1); _mm_store_si128((__m128i *)(dstp + x), xmi0); } srcp += src_stride; dstp += dst_stride; } }
static void GF_FUNC_ALIGN VS_CC float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height, int src_stride, int dst_stride, float th, int bits) { __m128 tmax = _mm_set1_ps(th); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x += 16) { __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax); __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax); __m128 xmf2 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 8), tmax); __m128 xmf3 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 12), tmax); __m128i xmi0 = _mm_packs_epi32(_mm_castps_si128(xmf0), _mm_castps_si128(xmf1)); __m128i xmi1 = _mm_packs_epi32(_mm_castps_si128(xmf2), _mm_castps_si128(xmf3)); xmi0 = _mm_packs_epi16(xmi0, xmi1); _mm_store_si128((__m128i *)(dstp + x), xmi0); } srcp += src_stride; dstp += dst_stride; } }
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x) { __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23)); __m128i inf = _mm_set1_epi32((uint32_t)255UL << 23); __m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23); __m128i sign_mask = _mm_set1_epi32(0x80000000UL); __m128i round_mask = _mm_set1_epi32(~0x0FFFU); __m128i ret_0x7E00 = _mm_set1_epi32(0x7E00); __m128i ret_0x7C00 = _mm_set1_epi32(0x7C00); __m128i f, sign, ge_inf, eq_inf; f = _mm_castps_si128(x); sign = _mm_and_si128(f, sign_mask); f = _mm_xor_si128(f, sign); ge_inf = _mm_cmpgt_epi32(f, inf); eq_inf = _mm_cmpeq_epi32(f, inf); f = _mm_and_si128(f, round_mask); f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic)); f = _mm_sub_epi32(f, round_mask); f = mm_min_epi32(f, f16inf); f = _mm_srli_epi32(f, 13); f = mm_blendv_ps(ret_0x7E00, f, ge_inf); f = mm_blendv_ps(ret_0x7C00, f, eq_inf); sign = _mm_srli_epi32(sign, 16); f = _mm_or_si128(f, sign); f = mm_packus_epi32(f, _mm_setzero_si128()); return f; }
__m128 log_ps(__m128 x) { __m128i emm0; __m128 one = *_ps_1; __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); x = _mm_max_ps(x, *reinterpret_cast<const __m128*>(_pi_min_norm_pos)); emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_mant_mask)); x = _mm_or_ps(x, *_ps_0p5); emm0 = _mm_sub_epi32(emm0, *_pi_0x7f); __m128 e = _mm_cvtepi32_ps(emm0); e = _mm_add_ps(e, one); __m128 mask = _mm_cmplt_ps(x, *_ps_cephes_SQRTHF); __m128 tmp = _mm_and_ps(x, mask); x = _mm_sub_ps(x, one); e = _mm_sub_ps(e, _mm_and_ps(one, mask)); x = _mm_add_ps(x, tmp); __m128 z = _mm_mul_ps(x, x); __m128 y = *_ps_cephes_log_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p5); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p6); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p7); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *_ps_cephes_log_p8); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, *_ps_cephes_log_q1); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, *_ps_0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, *_ps_cephes_log_q2); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN return x; }
float dot_product(const int N, const float *X, const int incX, const float *Y, const int incY)
{
    // Note: the loads below are contiguous and aligned, so this path effectively
    // assumes incX == incY == 1, 32-byte aligned X and Y, and N a multiple of 8.
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < N; i += 8, X += 8 * incX, Y += 8 * incY) {
        __m256 xval = _mm256_load_ps(X);
        __m256 yval = _mm256_load_ps(Y);
        __m256 val = _mm256_mul_ps(xval, yval);
        accum = _mm256_add_ps(val, accum);
    }
    // Reduce the values in accum into the smallest 32-bit subsection
    // a0 a1 a2 a3 a4 a5 a6 a7 -> b0 b1 b2 b3
    __m128 accum2 = _mm_add_ps(_mm256_castps256_ps128(accum),
                               _mm256_extractf128_ps(accum, 1));
    // b0 b1 b2 b3 -> c0 c1 b2 b3
    accum2 = _mm_add_ps(accum2,
                        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(accum2), 8)));
    // Add the high and low halves
    __m128 final_val = _mm_add_ss(_mm_insert_ps(accum2, accum2, 0x4e), accum2);
    return _mm_cvtss_f32(final_val);
}
static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; while (width >= 2) { xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }
inline FORCE_INLINE __m128 mm_cvtph_ps(__m128i x) { __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)113 << 23)); __m128i shift_exp = _mm_set1_epi32(0x7C00UL << 13); __m128i sign_mask = _mm_set1_epi32(0x8000U); __m128i mant_mask = _mm_set1_epi32(0x7FFF); __m128i exp_adjust = _mm_set1_epi32((127UL - 15UL) << 23); __m128i exp_adjust_nan = _mm_set1_epi32((127UL - 16UL) << 23); __m128i exp_adjust_denorm = _mm_set1_epi32(1UL << 23); __m128i zero = _mm_set1_epi16(0); __m128i exp, ret, ret_nan, ret_denorm, sign, mask0, mask1; x = _mm_unpacklo_epi16(x, zero); ret = _mm_and_si128(x, mant_mask); ret = _mm_slli_epi32(ret, 13); exp = _mm_and_si128(shift_exp, ret); ret = _mm_add_epi32(ret, exp_adjust); mask0 = _mm_cmpeq_epi32(exp, shift_exp); mask1 = _mm_cmpeq_epi32(exp, zero); ret_nan = _mm_add_epi32(ret, exp_adjust_nan); ret_denorm = _mm_add_epi32(ret, exp_adjust_denorm); ret_denorm = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ret_denorm), magic)); sign = _mm_and_si128(x, sign_mask); sign = _mm_slli_epi32(sign, 16); ret = mm_blendv_ps(ret_nan, ret, mask0); ret = mm_blendv_ps(ret_denorm, ret, mask1); ret = _mm_or_si128(ret, sign); return _mm_castsi128_ps(ret); }
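For comparison with the vector path above, a scalar reference decode of an IEEE half to float (the helper name is illustrative, not from the original source): sign and mantissa are shifted into place, the exponent is rebiased from 15 to 127, and the special cases the vector code handles with masks (Inf/NaN and subnormals) are handled explicitly.

#include <math.h>
#include <stdint.h>
#include <string.h>

static float half_to_float_scalar(uint16_t h)
{
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t exp  = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x03FFu;
    uint32_t bits;
    if (exp == 31) {                      /* Inf / NaN: max exponent, keep payload */
        bits = sign | 0x7F800000u | (mant << 13);
    } else if (exp == 0) {                /* zero / subnormal: value = mant * 2^-24 */
        float f = ldexpf((float)mant, -24);
        return sign ? -f : f;
    } else {                              /* normal: rebias exponent 15 -> 127 */
        bits = sign | ((exp + (127 - 15)) << 23) | (mant << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}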
/* merge "s+s" elements and return sorted result in "dest" array TODO(d'b): replace magic numbers with macro */ inline void bitonic_merge_kernel16n(float *dest, float *a, uint32_t sa, float *b /* must not be reversed*/, uint32_t sb) { __m128 ma[4]; __m128 mb[4]; __m128 lo[4]; __m128 hi[4]; #define LOAD16(arg) \ mb[3] = _mm_load_ps(arg); \ mb[2] = _mm_load_ps(arg + 4); \ mb[1] = _mm_load_ps(arg + 8); \ mb[0] = _mm_load_ps(arg + 12); arg+=16 float *last_a = a + sa; float *last_b = b + sb; float *last_dest = dest + sa + sb; ma[0] = _mm_load_ps(a); a+=4; ma[1] = _mm_load_ps(a); a+=4; ma[2] = _mm_load_ps(a); a+=4; ma[3] = _mm_load_ps(a); a+=4; for(; dest < (last_dest - 16); dest += 16) { /* Load either a or b */ if(a < last_a) { if(b < last_b) { if(*((uint32_t*)a) < *((uint32_t*)b)) { LOAD16(a); } else { LOAD16(b); } } else { LOAD16(a); } } else { LOAD16(b); } /* Reverse *b */ mb[0] = _mm_shuffle_ps(mb[0], mb[0], 0x1b); mb[1] = _mm_shuffle_ps(mb[1], mb[1], 0x1b); mb[2] = _mm_shuffle_ps(mb[2], mb[2], 0x1b); mb[3] = _mm_shuffle_ps(mb[3], mb[3], 0x1b); lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); lo[2] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); hi[2] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); lo[3] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); hi[3] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); _mm_store_ps(&dest[0], lo[0]); _mm_store_ps(&dest[4], lo[1]); _mm_store_ps(&dest[8], lo[2]); _mm_store_ps(&dest[12], lo[3]); _mm_store_ps(&dest[16], hi[2]); _mm_store_ps(&dest[20], hi[3]); _mm_store_ps(&dest[24], hi[0]); _mm_store_ps(&dest[28], hi[1]); bitonic_merge_kernel8core(dest, dest + 8); bitonic_merge_kernel8core(dest + 16, dest + 24); ma[0] = _mm_load_ps(&dest[16]); ma[1] = _mm_load_ps(&dest[20]); ma[2] = _mm_load_ps(&dest[24]); ma[3] = _mm_load_ps(&dest[28]); } }
static void cftmdl_128_SSE2(float* a) { const int l = 8; const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); int j0; __m128 wk1rv = _mm_load_ps(cftmdl_wk1r); for (j0 = 0; j0 < l; j0 += 2) { const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0)); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2)); const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3)); const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); const __m128 yy3 = _mm_add_ps(yy0, yy2); const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0)); _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1)); _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3))); a[j0 + 48] = -a[j0 + 48]; _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add)); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub)); _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4)); _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3))); } { int k = 64; int k1 = 2; int k2 = 2 * k1; const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]); const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]); const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]); const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]); const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]); wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]); for (j0 = k; j0 < l + k; j0 += 2) { const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32), 
_MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40), _MM_SHUFFLE(1, 0, 1, 0)); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56), _MM_SHUFFLE(1, 0, 1, 0)); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx2 = _mm_mul_ps(xx1, wk2rv); const __m128 xx3 = _mm_mul_ps(wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx4 = _mm_add_ps(xx2, xx3); const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); const __m128 xx11 = _mm_mul_ps( wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx12 = _mm_add_ps(xx10, xx11); const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); const __m128 xx21 = _mm_mul_ps( wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), _MM_SHUFFLE(2, 3, 0, 1)))); const __m128 xx22 = _mm_add_ps(xx20, xx21); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); _mm_storel_epi64( (__m128i*)&a[j0 + 32], _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); _mm_storel_epi64( (__m128i*)&a[j0 + 48], _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); _mm_storel_epi64( (__m128i*)&a[j0 + 40], _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2))); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); _mm_storel_epi64( (__m128i*)&a[j0 + 56], _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2))); } } }
void BrushToolEdit::drawInner(const QPoint &pt, float strength)
{
    float fixedStrength = params.strength;
    strength *= fixedStrength;

    auto color = params.color;
    std::array<int, 3> colorParts = Terrain::expandColor(color);
    __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0);

    SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST);
    (void) roundingModeScope;

    switch (tool->type()) {
    case BrushType::Blur:
        drawBlur(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Smoothen:
        drawSmoothen(pt, std::min(strength / 5.f, 4.f));
        break;
    case BrushType::Raise:
    case BrushType::Lower:
        if (tool->type() == BrushType::Lower) {
            fixedStrength = -fixedStrength;
            strength = -strength;
        }
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength *= 3.f;
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                (void) before;
                current -= tip * strength;
            });
            break;
        case BrushPressureMode::Constant:
            if (tool->type() == BrushType::Lower) {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength));
                });
            } else {
                drawRaiseLower(pt, [=](float &current, float before, float tip) {
                    current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength));
                });
            }
            break;
        case BrushPressureMode::Adjustable:
            drawRaiseLower(pt, [=](float &current, float before, float tip) {
                current = Terrain::quantizeOne(before - tip * strength);
            });
            break;
        }
        break;
    case BrushType::Paint:
        switch (params.pressureMode) {
        case BrushPressureMode::AirBrush:
            strength = 1.f - std::exp2(-strength);
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                (void) before;
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);
                auto factor = _mm_set1_ps(tip * strength);
                // blend
                auto diff = _mm_sub_ps(colorMM, currentMF);
                diff = _mm_mul_ps(diff, factor);
                currentMF = _mm_add_ps(currentMF, diff);
                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);
                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Constant:
            fixedStrength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert current color to FP32
                auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(&current)));
                currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128());
                currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128());
                auto currentMF = _mm_cvtepi32_ps(currentMM);
                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128());
                // use the "before" image to determine which way the color can change, and
                // compute the possible range of the result color
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * fixedStrength);
                auto adddiff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, adddiff);
                auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps());
                // compute output image
                auto out1 = _mm_max_ps(currentMF, beforeMF);
                auto out2 = _mm_min_ps(currentMF, beforeMF);
                currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2));
                // convert to RGB32
                currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128());
                currentMM = _mm_cvttps_epi32(currentMF);
                currentMM = _mm_packs_epi32(currentMM, currentMM);
                currentMM = _mm_packus_epi16(currentMM, currentMM);
                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(currentMM));
            });
            break;
        case BrushPressureMode::Adjustable:
            strength *= 0.01f;
            drawColor(pt, [=](quint32 &current, quint32 before, float tip) {
                // convert before color to FP32
                auto beforeMM = _mm_setr_epi32(before, 0, 0, 0);
                beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128());
                beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128());
                auto beforeMF = _mm_cvtepi32_ps(beforeMM);
                // blend
                auto diff = _mm_sub_ps(colorMM, beforeMF);
                auto factor = _mm_set1_ps(tip * strength);
                diff = _mm_mul_ps(diff, factor);
                beforeMF = _mm_add_ps(beforeMF, diff);
                // convert to RGB32
                beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128());
                beforeMM = _mm_cvttps_epi32(beforeMF);
                beforeMM = _mm_packs_epi32(beforeMM, beforeMM);
                beforeMM = _mm_packus_epi16(beforeMM, beforeMM);
                _mm_store_ss(reinterpret_cast<float *>(&current), _mm_castsi128_ps(beforeMM));
            });
            break;
        }
        break;
    }
}
const ALfloat *Resample_lerp32_SSE2(const BsincState* UNUSED(state), const ALfloat *src, ALuint frac, ALuint increment, ALfloat *restrict dst, ALuint numsamples) { const __m128i increment4 = _mm_set1_epi32(increment*4); const __m128 fracOne4 = _mm_set1_ps(1.0f/FRACTIONONE); const __m128i fracMask4 = _mm_set1_epi32(FRACTIONMASK); alignas(16) union { ALuint i[4]; float f[4]; } pos_; alignas(16) union { ALuint i[4]; float f[4]; } frac_; __m128i frac4, pos4; ALuint pos; ALuint i; InitiatePositionArrays(frac, increment, frac_.i, pos_.i, 4); frac4 = _mm_castps_si128(_mm_load_ps(frac_.f)); pos4 = _mm_castps_si128(_mm_load_ps(pos_.f)); for(i = 0;numsamples-i > 3;i += 4) { const __m128 val1 = _mm_setr_ps(src[pos_.i[0]], src[pos_.i[1]], src[pos_.i[2]], src[pos_.i[3]]); const __m128 val2 = _mm_setr_ps(src[pos_.i[0]+1], src[pos_.i[1]+1], src[pos_.i[2]+1], src[pos_.i[3]+1]); /* val1 + (val2-val1)*mu */ const __m128 r0 = _mm_sub_ps(val2, val1); const __m128 mu = _mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4); const __m128 out = _mm_add_ps(val1, _mm_mul_ps(mu, r0)); _mm_store_ps(&dst[i], out); frac4 = _mm_add_epi32(frac4, increment4);
RETi CAST(const __m128 x) { return _mm_castps_si128(x); }
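The cast intrinsics used throughout these examples only reinterpret the register contents; no conversion or rounding takes place and typically no instruction is emitted. A small standalone check (illustrative, not from the original source):

#include <assert.h>
#include <emmintrin.h>
#include <stdint.h>

static void cast_preserves_bits(void)
{
    __m128 f = _mm_set1_ps(-0.0f);           /* bit pattern 0x80000000 in each lane */
    __m128i i = _mm_castps_si128(f);         /* reinterpret, do not convert         */
    uint32_t lanes[4];
    _mm_storeu_si128((__m128i *)lanes, i);
    assert(lanes[0] == 0x80000000u);         /* same bits, now viewed as integers   */
}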
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128d v20, v21, v22, v23; __m128d v30, v31, v32, v33; __m128i v_cur_roots; __m128 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx2, idx3; idx1 = IDX(n,n); e[idx1] = q[n]; idx1++; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx3 = idx1; for (r = i; r < n; ++r) { // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&7); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 8 ) { v01 = _mm_loadu_pd( &w[idx1 ] ); v11 = _mm_loadu_pd( &w[idx1+2] ); v21 = _mm_loadu_pd( &w[idx1+4] ); v31 = _mm_loadu_pd( &w[idx1+6] ); v00 = _mm_loadu_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); v10 = _mm_loadu_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v20 = _mm_loadu_pd( &e[idx2+4] ); v21 = _mm_add_pd( v21, v_tmp ); v30 = _mm_loadu_pd( &e[idx2+6] ); v31 = _mm_add_pd( v31, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_loadu_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_loadu_pd( &e[idx1+2] ); v21 = _mm_add_pd( v21, v20 ); v23 = _mm_loadu_pd( &e[idx1+4] ); v31 = _mm_add_pd( v31, v30 ); v33 = _mm_loadu_pd( &e[idx1+6] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v22 = _mm_cmplt_pd( v21, v23 ); v32 = _mm_cmplt_pd( v31, v33 ); _mm_maskstore_pd( &e[idx1 ], _mm_castpd_si128( v02 ), v01 ); _mm_maskstore_pd( &e[idx1+2], _mm_castpd_si128( v12 ), v11 ); _mm_maskstore_pd( &e[idx1+4], _mm_castpd_si128( v22 ), v21 ); _mm_maskstore_pd( &e[idx1+6], _mm_castpd_si128( v32 ), v31 ); v_rootmask0 = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(0,2,0,2) ); v_rootmask1 = _mm_shuffle_ps( _mm_castpd_ps( v12 ), _mm_castpd_ps( v22 ), _MM_SHUFFLE(0,2,0,2) ); _mm_maskstore_ps( &root[idx1], _mm_castps_si128( v_rootmask0 ), _mm_castsi128_ps( v_cur_roots ) ); _mm_maskstore_ps( &root[idx1+4], _mm_castps_si128( v_rootmask1 ), _mm_castsi128_ps( v_cur_roots ) ); idx1 += 8; } idx3++; } } return e[IDX(0,n)]; }
void BM3D_Basic_Process::CollaborativeFilter(int plane, FLType *ResNum, FLType *ResDen, const FLType *src, const FLType *ref, const PosPairCode &code) const { PCType GroupSize = static_cast<PCType>(code.size()); // When para.GroupSize > 0, limit GroupSize up to para.GroupSize if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize) { GroupSize = d.para.GroupSize; } // Construct source group guided by matched pos code block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize); // Initialize retianed coefficients of hard threshold filtering int retainedCoefs = 0; // Apply forward 3D transform to the source group d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Apply hard-thresholding to the source group auto srcp = srcGroup.data(); auto thrp = d.f[plane].thrTable[GroupSize - 1].get(); const auto upper = srcp + srcGroup.size(); #if defined(__SSE2__) static const ptrdiff_t simd_step = 4; const ptrdiff_t simd_residue = srcGroup.size() % simd_step; const ptrdiff_t simd_width = srcGroup.size() - simd_residue; static const __m128 zero_ps = _mm_setzero_ps(); __m128i cmp_sum = _mm_setzero_si128(); for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step) { const __m128 s1 = _mm_load_ps(srcp); const __m128 t1p = _mm_load_ps(thrp); const __m128 t1n = _mm_sub_ps(zero_ps, t1p); const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p); const __m128 cmp2 = _mm_cmplt_ps(s1, t1n); const __m128 cmp = _mm_or_ps(cmp1, cmp2); const __m128 d1 = _mm_and_ps(cmp, s1); _mm_store_ps(srcp, d1); cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp)); } alignas(16) int32_t cmp_sum_i32[4]; _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum); retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3]; #endif for (; srcp < upper; ++srcp, ++thrp) { if (*srcp > *thrp || *srcp < -*thrp) { ++retainedCoefs; } else { *srcp = 0; } } // Apply backward 3D transform to the filtered group d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Calculate weight for the filtered group // Also include the normalization factor to compensate for the amplification introduced in 3D transform FLType denWeight = retainedCoefs < 1 ? 1 : FLType(1) / static_cast<FLType>(retainedCoefs); FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]); // Store the weighted filtered group to the numerator part of the basic estimation // Store the weight to the denominator part of the basic estimation srcGroup.AddTo(ResNum, dst_stride[plane], numWeight); srcGroup.CountTo(ResDen, dst_stride[plane], denWeight); }
_declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) { if (options.ignoreColor) { makeGreyscale(left); makeGreyscale(right); } float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16); int colorOffset = left.width * left.height; Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 }; float* drp = diff.r; float* dgp = diff.g; float* dbp = diff.b; float* dap = diff.a; float* lrp = left.r; float* lgp = left.g; float* lbp = left.b; float* lap = left.a; float* rrp = right.r; float* rgp = right.g; float* rbp = right.b; float* rap = right.a; Color error = ConvertToFloat(options.errorColor); auto er = _mm_set_ps1(error.r); auto eg = _mm_set_ps1(error.g); auto eb = _mm_set_ps1(error.b); auto ea = _mm_set_ps1(error.a); auto tolerance = _mm_set_ps1(options.tolerance); auto overlayTransparency = _mm_set_ps1(options.overlayTransparency); OverlayType overlayType = options.overlayType; byte weightByDiffPercentage = options.weightByDiffPercentage; auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0); auto onei = _mm_set1_epi32(1); auto one = _mm_set1_ps(1); auto zero = _mm_set1_ps(0); for (int y = 0; y < left.height; y++) { for (int x = 0; x < left.width; x+=4) { auto lr = _mm_load_ps(lrp); auto lg = _mm_load_ps(lgp); auto lb = _mm_load_ps(lbp); auto la = _mm_load_ps(lap); auto rr = _mm_load_ps(rrp); auto rg = _mm_load_ps(rgp); auto rb = _mm_load_ps(rbp); auto ra = _mm_load_ps(rap); auto rdiff = _mm_sub_ps(rr, lr); auto gdiff = _mm_sub_ps(rg, lg); auto bdiff = _mm_sub_ps(rb, lb); auto adiff = _mm_sub_ps(ra, la); auto distance = _mm_mul_ps(rdiff, rdiff); distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff)); distance = _mm_sqrt_ps(distance); auto t = overlayTransparency; if (weightByDiffPercentage) { t = _mm_mul_ps(t, distance); } auto isdiff = _mm_cmpgt_ps(distance, tolerance); t = _mm_min_ps(one, _mm_max_ps(zero, t)); auto mlr = rr; auto mlg = rg; auto mlb = rb; auto mla = ra; if (overlayType == OverlayType::Movement) { mlr = _mm_mul_ps(mlr, er); mlg = _mm_mul_ps(mlg, eg); mlb = _mm_mul_ps(mlb, eb); mla = _mm_mul_ps(mla, ea); } auto oneMinusT = _mm_sub_ps(one, t); auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t)); auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t)); auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t)); auto mixedA = one; if (overlayType != OverlayType::Movement) { mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t)); } // (((b ^ a) & mask)^a) auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr))); auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg))); auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb))); auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la))); diffPixelCount = _mm_xor_si128(diffPixelCount, _mm_and_si128(_mm_castps_si128(isdiff), _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei), diffPixelCount))); _mm_store_ps(drp, dr); _mm_store_ps(dgp, dg); _mm_store_ps(dbp, db); _mm_store_ps(dap, da); drp+=4; dgp+=4; dbp+=4; dap+=4; lrp+=4; lgp+=4; lbp+=4; lap+=4; rrp+=4; rgp+=4; rbp+=4; rap+=4; } } int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16); _mm_store_si128((__m128i*)pixelCounts, diffPixelCount); int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + 
pixelCounts[3]; _aligned_free(pixelCounts); return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) }; }
static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, int source_dx) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; uint8 u0, u1, v0, v1, y0, y1; uint32 uv_frac, y_frac, u, v, y; int x = 0; if (source_dx >= 0x20000) { x = 32768; } while(width >= 2) { u0 = u_buf[x >> 17]; u1 = u_buf[(x >> 17) + 1]; v0 = v_buf[x >> 17]; v1 = v_buf[(x >> 17) + 1]; y0 = y_buf[x >> 16]; y1 = y_buf[(x >> 16) + 1]; uv_frac = (x & 0x1fffe); y_frac = (x & 0xffff); u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; x += source_dx; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); y0 = y_buf[x >> 16]; y1 = y_buf[(x >> 16) + 1]; y_frac = (x & 0xffff); y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; x += source_dx; xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }