size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros  = _mm_setzero_si128();
    const __m128i prefix = sse::load(needle);
    const __m128i suffix = sse::load(needle + 4);
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);

            if (_mm_testc_si128(cmp, suff_mask)) {

                return i + bitpos;

            mask = bits::clear_leftmost_set(mask);

    return std::string::npos;
size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 4);
    assert(n > 0);

    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i zeros  = _mm_setzero_si128();

    __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
    __m128i curr;

    for (size_t i = 0; i < n; i += 16) {

        curr  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));

        const __m128i data0   = prev;
        const __m128i data1   = _mm_alignr_epi8(curr, prev, 8);
        const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);
        const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);
        prev = curr;

        const __m128i result  = _mm_packus_epi16(result0, result1);
        const __m128i cmp     = _mm_cmpeq_epi8(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp);

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask);

            if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {
                return i + bitpos;

            mask = bits::clear_leftmost_set(mask);

    return std::string::npos;
// Example no. 3
/**
 * Sum of absolute differences (SAD) between two 8x8 pixel windows, one in
 * each of two 8-bit images that share the same row stride.
 *
 * Each window starts 3 pixels to the left of and 3 rows above its reference
 * point, so it covers columns x-3..x+4 and rows y-3..y+4.
 *
 * @param img_data_L  first byte of the left image
 * @param img_data_R  first byte of the right image
 * @param img_stride  distance in bytes between consecutive rows (both images)
 * @param pt_L        reference pixel in the left image
 * @param pt_R        reference pixel in the right image
 * @return the accumulated SAD over the 64 pixel pairs.
 *
 * Every row is fetched with a 16-byte unaligned load although only the first
 * 8 bytes contribute, so 8 bytes beyond each window row must be readable.
 * No bounds checking is done on the window position.
 */
uint32_t compute_SAD8_SSE4(
	const uint8_t *img_data_L,
	const uint8_t *img_data_R,
	const size_t img_stride,
	const mrpt::utils::TPixelCoord &pt_L,
	const mrpt::utils::TPixelCoord &pt_R)
{
	const uint8_t *ptrL = img_data_L+img_stride*(pt_L.y-3) + (pt_L.x-3);
	const uint8_t *ptrR = img_data_R+img_stride*(pt_R.y-3) + (pt_R.x-3);

	// Refer to the documentation of _mm_mpsadbw_epu8() for details:
	// imm8 bit 2 selects the 4-byte offset into the first operand and
	// imm8 bits 1:0 select the 4-byte block of the second operand.
	constexpr int mask_00 = 0x00;   // SAD of bytes 3:0 of both L&R images
	constexpr int mask_44 = 0x05;   // SAD of bytes 7:4 of both L&R images

	// At most 64 * 255 = 16320, so no overflow is possible.
	uint32_t total_SAD = 0;
	for (int y=0;y<8;y++)
	{
		// Load one window row from each image ("u" allows 16-unaligned ptrs):
		const __m128i imgL = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptrL));
		const __m128i imgR = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptrR));

		// We'll only use the lowest 16bit sum (we are wasting a lot of potential of this instruction!!)
		const __m128i sad00 = _mm_mpsadbw_epu8(imgL,imgR, mask_00);
		const __m128i sad44 = _mm_mpsadbw_epu8(imgL,imgR, mask_44);

		// _mm_extract_epi16 reads lane 0 portably; the .m128i_i16 union
		// member only exists in the MSVC definition of __m128i.
		total_SAD += _mm_extract_epi16(sad00, 0) + _mm_extract_epi16(sad44, 0);

		// Step to the next row of the window; without this the same row
		// would be accumulated eight times.
		ptrL += img_stride;
		ptrR += img_stride;
	}

	return total_SAD;
}
/// Finds the first position in `s` whose four bytes equal the first four
/// bytes of `needle`.
///
/// The haystack is visited in steps of 8 bytes. `_mm_mpsadbw_epu8` with
/// immediate 0 yields eight 16-bit sums of absolute differences, one per
/// start offset 0..7; a zero sum means all four bytes are equal.
///
/// @param s       haystack
/// @param n       number of start positions to scan in `s`
/// @param needle  pattern; only its first four bytes take part in the compare
/// @return position of the first match or `std::string::npos`.
///
/// Both pointers are read with 16-byte unaligned loads, so 16 bytes must be
/// readable at `needle` and at every visited `s + i`. No check is made that a
/// reported match ends within the first `n` bytes.
size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) {

    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const __m128i zeros  = _mm_setzero_si128();

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        const unsigned mask = _mm_movemask_epi8(cmp);

        if (mask != 0) {

            // Each matching 16-bit lane sets two adjacent mask bits, so the
            // lowest set bit is even and half of it is the lane index.
            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
/// Searches `s` for `needle` using a 4-byte prefix filter followed by two
/// 16-byte comparisons of the remaining needle bytes (4 + 16 + 16 = 36).
///
/// Candidates are located exactly as in the other variants: a zero 16-bit
/// lane in the `_mm_mpsadbw_epu8` result marks a start offset whose first four
/// bytes equal the needle prefix. For each candidate, bytes 4..19 must equal
/// `suffix1` in full, and bytes 20..35 must equal `suffix2` except where
/// `suff_mask` forces the comparison to pass.
///
/// @param s            haystack
/// @param n            number of start positions to scan in `s`
/// @param needle       pattern to look for
/// @param needle_size  length of the pattern in bytes
/// @return position of the first confirmed candidate or `std::string::npos`.
///
/// No check is made here that a reported match ends within the first `n`
/// bytes.
/// NOTE(review): `sse::load` and `sse::mask_higher_bytes` are defined
/// elsewhere; presumably an unaligned 16-byte load and a mask with 0xff in
/// the bytes above the first `needle_size - 20` — confirm. If so, `s` and
/// `needle` must be readable beyond their logical ends, and `needle_size`
/// must be at least 20.
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros     = _mm_setzero_si128();
    const __m128i prefix    = sse::load(needle);
    const __m128i suffix1   = sse::load(needle + 4);
    const __m128i suffix2   = sse::load(needle + 16 + 4);
    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        // Keep one bit per 16-bit lane so every candidate is visited once.
        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);

            // OR-ing the mask in makes the masked bytes of the second block
            // count as equal regardless of their content.
            const __m128i c3 = _mm_or_si128(c2, suff_mask);
            const __m128i tmp = _mm_and_si128(c1, c3);

            // All 16 byte lanes set <=> both blocks matched.
            if (_mm_movemask_epi8(tmp) == 0xffff) {

                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// Example no. 6
/* Compile-fail fixture in GCC DejaGnu style: every statement passes `k4` in
   the argument position for which the trailing dg-error directive on the same
   line expects an "8-bit immediate" diagnostic (third, fifth or last
   argument). Each call must stay on one line with its directive, because the
   directive applies to the line it appears on.
   NOTE(review): the operands (i1, k1, a1, b1, d1, e1, l1, f1, k2..k4, ...)
   are not declared in this block; presumably file-scope variables, with k4 a
   non-constant int - confirm. The return type and the body braces are also
   not visible here. */
test8bit (void)
  i1 = _mm_cmpistrm (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
// Example no. 7
// Codegen fixture: forwards both vectors to _mm_mpsadbw_epu8 with the
// immediate 1. The comments inside the body are FileCheck directives matched
// against the compiler output (the function definition and a reference to
// the SSE4.1 mpsadbw intrinsic); they must be kept verbatim.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
  // CHECK: define {{.*}} @test_mm_mpsadbw_epu8
  // CHECK: @llvm.x86.sse41.mpsadbw
  return _mm_mpsadbw_epu8(x, y, 1);
}
// Example no. 8
// Codegen fixture: forwards both vectors to _mm_mpsadbw_epu8 with the
// immediate 1. The comments inside the body are FileCheck directives for two
// outputs: the IR call to the SSE4.1 mpsadbw intrinsic and, under a separate
// prefix, the emitted `mpsadbw $1` instruction. They must be kept verbatim.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_mpsadbw_epu8
  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw
  // CHECK-ASM: mpsadbw $1, %xmm{{.*}}, %xmm{{.*}}
  return _mm_mpsadbw_epu8(x, y, 1);
}
// Example no. 9
// Codegen fixture: forwards both vectors to _mm_mpsadbw_epu8 with the
// immediate 1. The comments inside the body are FileCheck directives that
// expect an IR call to the SSE4.1 mpsadbw intrinsic taking two <16 x i8>
// operands and the constant i8 1; they must be kept verbatim.
__m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
  // CHECK-LABEL: test_mm_mpsadbw_epu8
  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1)
  return _mm_mpsadbw_epu8(x, y, 1);