/* Randomised comparison of _mm_cmpestri against cmp_ei (defined elsewhere;
   presumably a scalar reference implementation -- confirm).

   Two arrays of NUM vectors are filled with random bytes.  For each vector
   pair, two explicit lengths in 0..17 (so values above the 16 bytes held by
   a vector are exercised as well) and one of the four immediates
   IMM_VAL0..IMM_VAL3 are drawn at random; the intrinsic result must equal
   the value returned by cmp_ei, otherwise the test aborts.  */
static void
sse4_2_test (void)
{
  union
  {
    __m128i x[NUM];
    char c[NUM * 16];
  } lhs, rhs;
  int got, want;
  int byte, slot;

  /* Random operand bytes; the two arrays are filled alternately.  */
  for (byte = 0; byte < NUM * 16; byte++)
    {
      lhs.c[byte] = rand ();
      rhs.c[byte] = rand ();
    }

  for (slot = 0; slot < NUM; slot++)
    {
      const int len_a = rand () % 18;
      const int len_b = rand () % 18;
      const int variant = rand () % 4;

      /* The mode is spelled out per branch because the intrinsic takes it
	 as a literal immediate.  */
      if (variant == 0)
	{
	  got = _mm_cmpestri (lhs.x[slot], len_a, rhs.x[slot], len_b,
			      IMM_VAL0);
	  want = cmp_ei (&lhs.x[slot], len_a, &rhs.x[slot], len_b,
			 IMM_VAL0, NULL);
	}
      else if (variant == 1)
	{
	  got = _mm_cmpestri (lhs.x[slot], len_a, rhs.x[slot], len_b,
			      IMM_VAL1);
	  want = cmp_ei (&lhs.x[slot], len_a, &rhs.x[slot], len_b,
			 IMM_VAL1, NULL);
	}
      else if (variant == 2)
	{
	  got = _mm_cmpestri (lhs.x[slot], len_a, rhs.x[slot], len_b,
			      IMM_VAL2);
	  want = cmp_ei (&lhs.x[slot], len_a, &rhs.x[slot], len_b,
			 IMM_VAL2, NULL);
	}
      else
	{
	  got = _mm_cmpestri (lhs.x[slot], len_a, rhs.x[slot], len_b,
			      IMM_VAL3);
	  want = cmp_ei (&lhs.x[slot], len_a, &rhs.x[slot], len_b,
			 IMM_VAL3, NULL);
	}

      if (want != got)
	abort ();
    }
}
/*
 * Fast pre-scan of [buf, buf_end) for the first byte that falls inside one of
 * the byte ranges described by `ranges`.
 *
 * buf, buf_end  region to scan
 * ranges        range bounds as consecutive (low, high) byte pairs; 16 bytes
 *               are always loaded from this pointer, whatever ranges_size is
 * ranges_size   number of valid bytes in `ranges`
 * found         set to 1 when a matching byte was located, 0 otherwise
 *
 * With SSE4.2 only whole 16-byte blocks are examined: on a hit the returned
 * pointer addresses the matching byte, otherwise it addresses the first byte
 * that was not examined (a tail shorter than 16 bytes is left to the caller).
 * Without SSE4.2 nothing is scanned and buf is returned unchanged.
 */
static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found)
{
    *found = 0;
#if __SSE4_2__
    if (likely(buf_end - buf >= 16)) {
        const __m128i range_set = _mm_loadu_si128((const __m128i *)ranges);
        /* bytes covered by whole 16-byte blocks */
        size_t remaining = (buf_end - buf) & ~15;

        for (;;) {
            const __m128i chunk = _mm_loadu_si128((const __m128i *)buf);
            const int hit = _mm_cmpestri(range_set, ranges_size, chunk, 16,
                                         _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | _SIDD_UBYTE_OPS);
            if (unlikely(hit != 16)) {
                /* index below 16: a byte of this block is inside a range */
                *found = 1;
                buf += hit;
                break;
            }
            buf += 16;
            remaining -= 16;
            if (unlikely(remaining == 0))
                break;
        }
    }
#else
    /* suppress unused parameter warning */
    (void)buf_end;
    (void)ranges;
    (void)ranges_size;
#endif
    return buf;
}
size_t scanHaystackBlock(const StringPieceLite haystack, const StringPieceLite needles, uint64_t blockStartIdx) { DCHECK_GT(needles.size(), 16u); // should handled by *needles16() method DCHECK(blockStartIdx + 16 <= haystack.size() || (page_for(haystack.data() + blockStartIdx) == page_for(haystack.data() + blockStartIdx + 15))); __m128i arr1; if (HAYSTACK_ALIGNED) { arr1 = _mm_load_si128( reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx)); } else { arr1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx)); } // This load is safe because needles.size() >= 16 auto arr2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(needles.data())); size_t b = _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0); size_t j = nextAlignedIndex(needles.data()); for (; j < needles.size(); j += 16) { arr2 = _mm_load_si128( reinterpret_cast<const __m128i*>(needles.data() + j)); auto index = _mm_cmpestri( arr2, int(needles.size() - j), arr1, int(haystack.size() - blockStartIdx), 0); b = std::min<size_t>(index, b); } if (b < 16) { return blockStartIdx + b; } return std::string::npos; }
// Scans XML character data starting at yycur with SSE4.2 string instructions
// and hands each accepted run to saxProcessor->reportCharDataContent().
//
// yycur / yylim, YYSWITCHBUFFER, STTNISTRLENLIMIT and recogUnicodeRange() are
// defined outside this block; their exact contracts are not visible here.
//
// Line ends: a leading LF (0x0A) or CR (0x0D) is announced through
// saxProcessor->newLine(). A CR LF pair is consumed as one line end with the
// CR skipped, and a lone CR is rewritten in place to LF. The newline constants
// follow the in-function comment and the 0x0A / 0x0D entries of nonCharData.
void scanCharDataContentwithSTTNI(SAX2Processor* saxProcessor)
{
    unsigned int length = yylim - yycur;
    unsigned char* data = (unsigned char*)yycur;

    // Nothing to consume when the scan position already sits on '<', '&' or ']'.
    if( *data == '<' || *data == '&' || *data == ']')
        return;

    unsigned int dataLen = 0;

    // initialize the one byte encoding rule and nonCharaData rule
    // asciiCharData: five (low, high) byte ranges, lowest byte first:
    //   00-00, 20-25, 27-3B, 3D-5C, 5E-7F
    const __m128i asciiCharData = _mm_set_epi8(0,0,0,0,0,0,0x7F,0x5E,0x5C,0x3D,
                                               0x3B,0x27,0x25,0x20,0,0);
    // nonCharData: the five terminators LF, CR, '&', '<', ']'
    const __m128i nonCharData = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0x5D,0x3C,0x26,0x0D,0x0A);

    do {
        // special new line processing for 'x0A','x0D'
        if( *data == '\x0A' ) {
            saxProcessor->newLine((char*)data);
            data++;
            length--;
        } else if( *data == '\x0D' ) {
            saxProcessor->newLine((char*)data);
            if( *(data+1) == '\x0A' ) {
                // CR LF: step over both bytes; advancing yycur keeps the CR
                // out of the run reported below
                data += 2;
                length -= 2;
                yycur++;
            } else {
                // lone CR: turn it into LF in place
                *data = '\x0A';
                data++;
                length--;
            }
        }

        while( length > 0 ) {
            if( length >= 16 )
                dataLen = 16;
            else
                dataLen = length;
            const __m128i mData = _mm_loadu_si128((__m128i*)data);

            // locate the Character Data part with the nonCharaData characters
            int index = _mm_cmpestri(nonCharData, 5, mData, dataLen, _SIDD_CMP_EQUAL_ANY);
            if( index == 0 )
                break;
            // no terminator in this block: the instruction reports 16
            if( index > dataLen )
                index = dataLen;
            bool shouldBreak = index < dataLen ? true : false;

            // check the one byte encoding rule(ASCII)
            // a set bit marks a byte outside every asciiCharData range
            unsigned int mask = _mm_cvtsi128_si32(_mm_cmpestrm(asciiCharData, 10, mData, index,
                                                  _SIDD_CMP_RANGES|_SIDD_MASKED_NEGATIVE_POLARITY));

            // if not all hit ASCII, continue to check other Unicode rules
            if( mask == 0 || recogUnicodeRange(mData, index, ~mask)) {
                data += index;
                length -= index;
                if( shouldBreak )
                    break;
            } else {
                break;
            }
        }

        unsigned int passLen = (char*)data - yycur;
        if( passLen == 0 )
            break;

        // report Character Data to user
        saxProcessor->reportCharDataContent(yycur, passLen);
        yycur += passLen;
        YYSWITCHBUFFER;
        // keep going only while the run was cut short by a line end
    } while( length >= STTNISTRLENLIMIT && (*data == '\x0A' || *data == '\x0D') );
}
// helper method for case where needles.size() <= 16 size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, const StringPieceLite needles) { DCHECK_GT(haystack.size(), 0u); DCHECK_GT(needles.size(), 0u); DCHECK_LE(needles.size(), 16u); if ((needles.size() <= 2 && haystack.size() >= 256) || // must bail if we can't even SSE-load a single segment of haystack (haystack.size() < 16 && page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) || // can't load needles into SSE register if it could cross page boundary page_for(needles.end() - 1) != page_for(needles.data() + 15)) { return detail::qfind_first_byte_of_nosse(haystack, needles); } auto arr2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(needles.data())); // do an unaligned load for first block of haystack auto arr1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(haystack.data())); auto index = _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0); if (index < 16) { return index; } // Now, we can do aligned loads hereafter... size_t i = nextAlignedIndex(haystack.data()); for (; i < haystack.size(); i+= 16) { arr1 = _mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i)); index = _mm_cmpestri( arr2, int(needles.size()), arr1, int(haystack.size() - i), 0); if (index < 16) { return i + index; } } return std::string::npos; }
/* Compile-fail test: every statement below passes k4 in the argument
   position that the attached directive names as requiring an 8-bit
   immediate, and carries the diagnostic text expected for that statement.
   The operands (i1, k1, b1, ...) are declared outside this function;
   presumably k4 is not a compile-time constant -- confirm against its
   declaration.  The directive comments are read by the test harness and
   must stay on the same line as their statement.  */
void
test8bit (void)
{
  /* Expected diagnostic names the third argument.  */
  i1 = _mm_cmpistrm (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  /* Expected diagnostic names the fifth argument.  */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  /* Expected diagnostic names the last argument.  */
  b1 = _mm256_blend_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
/*
 * Appends the escaped form of byte c at dst and returns the advanced write
 * pointer. Bytes without an entity are copied through unchanged.
 *
 *   &  -> &amp;    >  -> &gt;     <  -> &lt;     "  -> &quot;
 *   '  -> &#39;    `  -> &#96;    {  -> &#123;   }  -> &#125;
 *
 * '`' is escaped for IE, which interprets back-quote as a valid quoting
 * character (ref: https://rt.cpan.org/Public/Bug/Display.html?id=84971).
 * '{' and '}' are escaped for javascript templates such as AngularJS
 * (ref: https://github.com/angular/angular.js/issues/5601).
 */
static inline char *_phe_escape_byte(char *dst, unsigned char c) {
    /* entity number per byte value; 0 = no escaping needed */
    static const int pp[UCHAR_MAX + 1] = {
        /*   0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        /*  16 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        /*  32 */ 0,0,4,0,0,0,1,5,0,0,0,0,0,0,0,0,
        /*  48 */ 0,0,0,0,0,0,0,0,0,0,0,0,3,0,2,0,
        /*  64 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        /*  80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        /*  96 */ 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        /* 112 */ 0,0,0,0,0,0,0,0,0,0,0,7,0,8,0,0
        /* following zero(s) */
    };
    /* entity text and its length, indexed by entity number */
    static const char *const dd[] = {NULL, "&amp;", "&gt;", "&lt;", "&quot;",
                                     "&#39;", "&#96;", "&#123;", "&#125;"};
    static const int dl[] = {0, 5, 4, 4, 6, 5, 5, 6, 6};

    const int i = pp[c];
    if (i == 0) {
        *dst++ = (char) c;
    } else {
        memcpy(dst, dd[i], (size_t) dl[i]);
        dst += dl[i];
    }
    return dst;
}

/*
 * Writes the HTML-escaped form of input[0 .. input_size) to dst, followed by
 * a terminating NUL byte.
 *
 * dst         output buffer; the longest entity is 6 bytes, so the caller
 *             must provide at least 6 * input_size + 1 bytes
 * input       bytes to escape (need not be NUL-terminated)
 * input_size  number of bytes to read from input
 *
 * With SSE4.2 the input is searched 16 bytes at a time for bytes selected by
 * RANGES / RANGE_SIZE / CMPESTRI_FLAG (defined elsewhere; assumed to request
 * the least significant index of a byte-range match -- confirm). Otherwise
 * every byte goes through the lookup table. Both paths emit the same text.
 * Neither path reads beyond input + input_size.
 */
static inline void _phe_escape_html(char *dst, const char *input, size_t input_size) {
#if __SSE4_2__
    const __m128i ranges = _mm_loadu_si128((const __m128i *) RANGES);

    while (input_size > 0) {
        const int chunk = input_size < 16 ? (int) input_size : 16;
        __m128i v;
        int cursor;

        if (chunk == 16) {
            v = _mm_loadu_si128((const __m128i *) input);
        } else {
            /* short tail: stage it in a local buffer so the 16-byte load
               stays inside memory we own */
            char tail[16] = {0};
            memcpy(tail, input, (size_t) chunk);
            v = _mm_loadu_si128((const __m128i *) tail);
        }

        /* only the first `chunk` bytes are valid; 16 means "no match" */
        cursor = _mm_cmpestri(ranges, RANGE_SIZE, v, chunk, CMPESTRI_FLAG);
        if (cursor > chunk)
            cursor = chunk;

        /* bytes in front of the match need no escaping */
        memcpy(dst, input, (size_t) cursor);
        dst += cursor;
        input += cursor;
        input_size -= (size_t) cursor;

        if (cursor < chunk) {
            dst = _phe_escape_byte(dst, (unsigned char) *input);
            input += 1;
            input_size -= 1;
        }
    }
#else
    const char *ptr = input;
    const char *const end = input + input_size;

    while (ptr < end)
        dst = _phe_escape_byte(dst, (unsigned char) *ptr++);
#endif
    *dst = '\0';
}
/* Randomised check of the whole _mm_cmpestr* family against cmp_ei (defined
   elsewhere; presumably a scalar reference implementation -- confirm).

   For each of NUM random vector pairs, two explicit lengths in 0..17 and one
   of the immediates IMM_VAL0..IMM_VAL3 are drawn at random.  The index from
   _mm_cmpestri must equal the value cmp_ei returns.  The C/Z/S/O results,
   combined through CFLAG/ZFLAG/SFLAG/OFLAG, must equal the flags cmp_ei
   reports.  _mm_cmpestra must be non-zero exactly when both the C and Z
   results are zero.  Any mismatch aborts.  */
static void
TEST (void)
{
  union
  {
    __m128i x[NUM];
    char c[NUM * 16];
  } lhs, rhs;
  int got_index, want_index, want_flags;
  int len_a, len_b;
  int got_flags, cf, zf, sf, of, af;
  int byte, slot;

/* Evaluates every intrinsic flavour and the cmp_ei value for one mode.  A
   macro is used because the mode has to reach the intrinsics as a literal
   immediate.  */
#define CMPESTR_RUN_ALL(imm)						     \
  do									     \
    {									     \
      got_index = _mm_cmpestri (lhs.x[slot], len_a, rhs.x[slot], len_b, imm); \
      cf = _mm_cmpestrc (lhs.x[slot], len_a, rhs.x[slot], len_b, imm);	     \
      zf = _mm_cmpestrz (lhs.x[slot], len_a, rhs.x[slot], len_b, imm);	     \
      sf = _mm_cmpestrs (lhs.x[slot], len_a, rhs.x[slot], len_b, imm);	     \
      of = _mm_cmpestro (lhs.x[slot], len_a, rhs.x[slot], len_b, imm);	     \
      af = _mm_cmpestra (lhs.x[slot], len_a, rhs.x[slot], len_b, imm);	     \
      want_index = cmp_ei (&lhs.x[slot], len_a, &rhs.x[slot], len_b, imm,    \
			   &want_flags);				     \
    }									     \
  while (0)

  /* Random operand bytes; the two arrays are filled alternately.  */
  for (byte = 0; byte < NUM * 16; byte++)
    {
      lhs.c[byte] = rand ();
      rhs.c[byte] = rand ();
    }

  for (slot = 0; slot < NUM; slot++)
    {
      len_a = rand () % 18;
      len_b = rand () % 18;

      switch (rand () % 4)
	{
	case 0:
	  CMPESTR_RUN_ALL (IMM_VAL0);
	  break;
	case 1:
	  CMPESTR_RUN_ALL (IMM_VAL1);
	  break;
	case 2:
	  CMPESTR_RUN_ALL (IMM_VAL2);
	  break;
	default:
	  CMPESTR_RUN_ALL (IMM_VAL3);
	  break;
	}

      if (want_index != got_index)
	abort ();

      got_flags = (cf ? CFLAG : 0) | (zf ? ZFLAG : 0)
		  | (sf ? SFLAG : 0) | (of ? OFLAG : 0);

      /* The "above" result has to be the exact opposite of (C or Z).  */
      if (got_flags != want_flags || (af != 0) == (cf || zf))
	abort ();
    }

#undef CMPESTR_RUN_ALL
}
// Code-generation test: the function returns _mm_cmpestri(A, LA, B, LB, 7).
// The directive comments on the function line are read by the test driver
// (presumably FileCheck -- confirm); they expect a call to
// @llvm.x86.sse42.pcmpestri128 whose final operand is i8 7.
// NOTE(review): as laid out here the directives, the return statement and the
// closing brace share one physical line, so everything after the first "//"
// is comment text -- confirm that line breaks were not lost.
int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) { // CHECK-LABEL: test_mm_cmpestri // CHECK: call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7) return _mm_cmpestri(A, LA, B, LB, 7); }
// Code-generation test: the function returns _mm_cmpestri(A, LA, B, LB, 7).
// The directive comments on the function line are read by the test driver
// (presumably FileCheck -- confirm); they only require that
// @llvm.x86.sse42.pcmpestri128 appears in the output for this function.
// NOTE(review): as laid out here the directives, the return statement and the
// closing brace share one physical line, so everything after the first "//"
// is comment text -- confirm that line breaks were not lost.
int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) { // CHECK-LABEL: test_mm_cmpestri // CHECK: @llvm.x86.sse42.pcmpestri128 return _mm_cmpestri(A, LA, B, LB, 7); }