// Searches `s` for `needle` using MPSADBW to locate candidate positions whose
// first four bytes match the first four bytes of the needle, then verifies the
// remaining needle bytes with a single masked 16-byte comparison.
//
//   s, n         - haystack pointer and the upper bound for the scan offset
//   needle       - pattern; bytes [0,4) form the prefix, bytes from 4 on the tail
//   needle_size  - pattern length; the tail mask is built from needle_size - 4
//
// Returns the offset of the first verified candidate, or std::string::npos.
//
// NOTE(review): sse::load presumably performs a 16-byte load and
// sse::mask_lower_bytes(c) presumably sets the lowest c bytes -- confirm against
// their definitions. The loop does not bound its reads or the returned offset
// by n; callers are presumably expected to handle that -- confirm.
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i all_zero    = _mm_setzero_si128();
    const __m128i needle_head = sse::load(needle);
    const __m128i needle_tail = sse::load(needle + 4);
    const __m128i tail_mask   = sse::mask_lower_bytes(needle_size - 4);

    for (size_t base = 0; base < n; base += 8) {

        const __m128i chunk = sse::load(s + base);

        // Eight 16-bit sums of absolute differences, one per 4-byte window
        // starting at chunk offsets 0..7; a zero sum means the window equals
        // the needle's first four bytes.
        const __m128i sad       = _mm_mpsadbw_epu8(chunk, needle_head, 0);
        const __m128i zero_sums = _mm_cmpeq_epi16(sad, all_zero);

        // Keep one bit (the even one) per 16-bit lane.
        unsigned candidates = _mm_movemask_epi8(zero_sums) & 0x5555;

        for (; candidates != 0; candidates = bits::clear_leftmost_set(candidates)) {

            const auto offset = bits::get_first_bit_set(candidates)/2;

            const __m128i haystack_tail = sse::load(s + base + offset + 4);
            const __m128i tail_equal    = _mm_cmpeq_epi8(haystack_tail, needle_tail);

            // testc is true when every bit set in tail_mask is also set in tail_equal.
            if (_mm_testc_si128(tail_equal, tail_mask)) {
                return base + offset;
            }
        }
    }

    return std::string::npos;
}
// Searches `s` for `needle`, 16 candidate positions per iteration: MPSADBW
// flags the positions whose first four bytes equal the needle's first four
// bytes, and `memcmp_fun` confirms the rest of the needle for each candidate.
//
//   s, n        - haystack pointer and the upper bound for the scan offset
//   needle      - pattern; bytes [0,4) are matched by MPSADBW
//   memcmp_fun  - called as memcmp_fun(haystack + 4, needle + 4); a truthy
//                 result is treated as "tail matches"
//
// Returns the offset of the first confirmed candidate, or std::string::npos.
//
// NOTE(review): `k` and `MEMCMP` are not declared inside this block; they are
// presumably template parameters declared just above it -- confirm.
size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 4);
    assert(n > 0);

    const __m128i needle_head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i all_zero    = _mm_setzero_si128();

    // `window_lo` always holds the 16 bytes at the current offset; the next 16
    // bytes are loaded one iteration ahead and carried over.
    __m128i window_lo = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));

    for (size_t base = 0; base < n; base += 16) {

        const __m128i window_hi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + base + 16));

        // Bytes 8..23 of the 32-byte window, so the second MPSADBW covers
        // candidate offsets 8..15.
        const __m128i shifted = _mm_alignr_epi8(window_hi, window_lo, 8);

        const __m128i sad_lo = _mm_mpsadbw_epu8(window_lo, needle_head, 0);
        const __m128i sad_hi = _mm_mpsadbw_epu8(shifted, needle_head, 0);

        window_lo = window_hi;

        // Saturating pack keeps zero sums zero, giving one byte per candidate.
        const __m128i sad_bytes = _mm_packus_epi16(sad_lo, sad_hi);
        const __m128i is_zero   = _mm_cmpeq_epi8(sad_bytes, all_zero);

        unsigned candidates = _mm_movemask_epi8(is_zero);

        for (; candidates != 0; candidates = bits::clear_leftmost_set(candidates)) {

            const auto offset = bits::get_first_bit_set(candidates);

            if (memcmp_fun(s + base + offset + 4, needle + 4)) {
                return base + offset;
            }
        }
    }

    return std::string::npos;
}
/**
 * Computes the sum of absolute differences (SAD) between two 8x8 pixel
 * patches of 8-bit images using the SSE4.1 MPSADBW instruction.
 *
 * Each patch spans columns [x-3, x+4] and rows [y-3, y+4] around its point.
 *
 * @param img_data_L  Pointer to the first byte of the left image.
 * @param img_data_R  Pointer to the first byte of the right image.
 * @param img_stride  Distance in bytes between consecutive image rows
 *                    (the same stride is applied to both images).
 * @param pt_L        Patch reference point in the left image.
 * @param pt_R        Patch reference point in the right image.
 * @return The accumulated SAD over the 64 pixel pairs; 0 when the code is
 *         built without RSO_HAS_SSE4.
 *
 * The caller must guarantee that both patches lie entirely inside their images.
 */
uint32_t compute_SAD8_SSE4(
    const uint8_t *img_data_L,
    const uint8_t *img_data_R,
    const size_t img_stride,
    const mrpt::utils::TPixelCoord &pt_L,
    const mrpt::utils::TPixelCoord &pt_R)
{
#if RSO_HAS_SSE4
    const uint8_t *ptrL = img_data_L+img_stride*(pt_L.y-3) + (pt_L.x-3);
    const uint8_t *ptrR = img_data_R+img_stride*(pt_R.y-3) + (pt_R.x-3);

    // Refer to the documentation of _mm_mpsadbw_epu8() for details
    // See also: http://software.intel.com/en-us/articles/motion-estimation-with-intel-streaming-simd-extensions-4-intel-sse4/
    const int mask_00 = 0x00; // SAD of bytes 3:0 of both L&R images
    const int mask_44 = 0x05; // SAD of bytes 7:4 of both L&R images

    // The largest possible total is 64 * 255 = 16320, so 32 bits is ample.
    uint32_t total_SAD = 0;

    for (int y=0;y<8;y++)
    {
        // Load exactly 8 pixels from each image row. An 8-byte load (upper half
        // zeroed) is sufficient because only bytes 0..7 contribute to the two
        // sums used below, and it avoids touching memory past the patch row.
        const __m128i imgL = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptrL));
        const __m128i imgR = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(ptrR));

        // We'll only use the lowest 16bit sum (we are wasting a lot of potential of this instruction!!)
        const __m128i sad00 = _mm_mpsadbw_epu8(imgL,imgR, mask_00);
        const __m128i sad44 = _mm_mpsadbw_epu8(imgL,imgR, mask_44);

        // Portable lane extraction (the .m128i_i16 member exists only on MSVC).
        // Each partial sum is at most 4 * 255, so the 16-bit add cannot overflow.
        const __m128i row_sad = _mm_add_epi16(sad00, sad44);
        total_SAD += static_cast<uint32_t>(_mm_extract_epi16(row_sad, 0));

        ptrL+=img_stride;
        ptrR+=img_stride;
    }
    return total_SAD;
#else
    return 0; // shouldn't ever reach this anyway
#endif
}
// Searches `s` for the four bytes at `needle`, testing eight start positions
// per iteration with MPSADBW.
//
//   s, n    - haystack pointer and the upper bound for the scan offset
//   needle  - pattern; 16 bytes are loaded, but only the first four take part
//             in the comparison selected by the immediate 0
//
// Returns the offset of the first position whose 4-byte window has a zero sum
// of absolute differences against the pattern, or std::string::npos.
//
// NOTE(review): the loop does not bound its reads or the returned offset by n;
// callers are presumably expected to handle that -- confirm.
size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) {

    const __m128i all_zero = _mm_setzero_si128();
    const __m128i pattern  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));

    size_t base = 0;
    while (base < n) {

        const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + base));
        const __m128i sad   = _mm_mpsadbw_epu8(chunk, pattern, 0);

        // Two mask bits per 16-bit lane, hence the division by two below.
        const unsigned hits = _mm_movemask_epi8(_mm_cmpeq_epi16(sad, all_zero));

        if (hits != 0) {
            return base + bits::get_first_bit_set(hits)/2;
        }

        base += 8;
    }

    return std::string::npos;
}
// Searches `s` for `needle` using MPSADBW to locate candidate positions whose
// first four bytes match the needle's first four bytes, then verifies the rest
// of the needle with two 16-byte comparisons (needle bytes 4..19 and 20..35).
//
//   s, n         - haystack pointer and the upper bound for the scan offset
//   needle       - pattern
//   needle_size  - pattern length; the mask for the second tail block is
//                  built from needle_size - (16 + 4)
//
// Returns the offset of the first verified candidate, or std::string::npos.
//
// NOTE(review): sse::load presumably performs a 16-byte load and
// sse::mask_higher_bytes(c) presumably sets every byte except the lowest c,
// so that bytes beyond the needle are treated as matching -- confirm against
// their definitions.
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i all_zero     = _mm_setzero_si128();
    const __m128i needle_head  = sse::load(needle);
    const __m128i needle_tail1 = sse::load(needle + 4);
    const __m128i needle_tail2 = sse::load(needle + 16 + 4);
    const __m128i ignore_mask  = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t base = 0; base < n; base += 8) {

        const __m128i chunk     = sse::load(s + base);
        const __m128i sad       = _mm_mpsadbw_epu8(chunk, needle_head, 0);
        const __m128i zero_sums = _mm_cmpeq_epi16(sad, all_zero);

        // Keep one bit (the even one) per 16-bit lane.
        unsigned candidates = _mm_movemask_epi8(zero_sums) & 0x5555;

        for (; candidates != 0; candidates = bits::clear_leftmost_set(candidates)) {

            const auto offset = bits::get_first_bit_set(candidates)/2;

            const __m128i eq_block1 = _mm_cmpeq_epi8(sse::load(s + base + offset + 4), needle_tail1);
            const __m128i eq_block2 = _mm_cmpeq_epi8(sse::load(s + base + offset + 16 + 4), needle_tail2);

            // Bytes covered by ignore_mask count as equal in the second block;
            // the first block must match in full.
            const __m128i block2_ok = _mm_or_si128(eq_block2, ignore_mask);
            const __m128i all_ok    = _mm_and_si128(eq_block1, block2_ok);

            if (_mm_movemask_epi8(all_ok) == 0xffff) {
                return base + offset;
            }
        }
    }

    return std::string::npos;
}
/*
 * Compile-time diagnostic test: every statement below passes `k4` as the
 * immediate operand of an intrinsic and carries a `dg-error` annotation with
 * the diagnostic text expected for that statement ("the third/fifth/last
 * argument must be an 8-bit immediate").
 *
 * NOTE(review): all operands used here are declared outside this block; `k4`
 * is presumably a non-constant int -- confirm.
 * NOTE(review): the `dg-error` annotations look like DejaGnu directives, which
 * apply to the source line they sit on -- keep each one on its statement's line.
 */
void test8bit (void) {
  /* Intrinsics whose third argument is the immediate. */
  i1 = _mm_cmpistrm (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  /* Intrinsics whose fifth argument is the immediate. */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  /* Intrinsics whose last argument is the immediate. */
  b1 = _mm256_blend_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
// Code-generation test for _mm_mpsadbw_epu8 with the immediate 1.
// The CHECK annotations expect a definition of test_mm_mpsadbw_epu8 and a
// reference to @llvm.x86.sse41.mpsadbw in the output.
// NOTE(review): the CHECK lines look like LLVM FileCheck directives -- confirm
// against the test's RUN line, which is outside this block.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) { // CHECK: define {{.*}} @test_mm_mpsadbw_epu8 // CHECK: @llvm.x86.sse41.mpsadbw return _mm_mpsadbw_epu8(x, y, 1); }
// Code-generation test for _mm_mpsadbw_epu8 with the immediate 1.
// The CHECK annotations expect a call to @llvm.x86.sse41.mpsadbw returning
// <8 x i16>; the CHECK-ASM annotation expects an `mpsadbw $1` instruction on
// two xmm registers.
// NOTE(review): the CHECK lines look like LLVM FileCheck directives -- confirm
// against the test's RUN lines, which are outside this block.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mpsadbw_epu8 // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw // CHECK-ASM: mpsadbw $1, %xmm{{.*}}, %xmm{{.*}} return _mm_mpsadbw_epu8(x, y, 1); }
// Code-generation test for _mm_mpsadbw_epu8 with the immediate 1.
// The CHECK annotation expects a call to @llvm.x86.sse41.mpsadbw taking two
// <16 x i8> operands and the constant `i8 1`, and returning <8 x i16>.
// NOTE(review): the CHECK lines look like LLVM FileCheck directives -- confirm
// against the test's RUN line, which is outside this block.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mpsadbw_epu8 // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1) return _mm_mpsadbw_epu8(x, y, 1); }