inline void u32toa_sse2(uint32_t value, char* buffer) { if (value < 10000) { const uint32_t d1 = (value / 100) << 1; const uint32_t d2 = (value % 100) << 1; if (value >= 1000) *buffer++ = gDigitsLut[d1]; if (value >= 100) *buffer++ = gDigitsLut[d1 + 1]; if (value >= 10) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = '\0'; } else if (value < 100000000) { // Experiment shows that this case SSE2 is slower #if 0 const __m128i a = Convert8DigitsSSE2(value); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; #ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); #else digit = __builtin_ctz(~mask | 0x8000); #endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8)); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); buffer[8 - digit] = '\0'; #else // value = bbbbcccc const uint32_t b = value / 10000; const uint32_t c = value % 10000; const uint32_t d1 = (b / 100) << 1; const uint32_t d2 = (b % 100) << 1; const uint32_t d3 = (c / 100) << 1; const uint32_t d4 = (c % 100) << 1; if (value >= 10000000) *buffer++ = gDigitsLut[d1]; if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1]; if (value >= 100000) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = gDigitsLut[d3]; *buffer++ = gDigitsLut[d3 + 1]; *buffer++ = gDigitsLut[d4]; *buffer++ = gDigitsLut[d4 + 1]; *buffer++ = '\0'; #endif } else { // value = aabbbbbbbb in decimal const uint32_t a = value / 100000000; // 1 to 42 value %= 100000000; if (a >= 10) { const unsigned i = a << 1; *buffer++ = gDigitsLut[i]; *buffer++ = gDigitsLut[i + 1]; } else *buffer++ = '0' + static_cast<char>(a); const __m128i b = 
Convert8DigitsSSE2(value); const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); const __m128i result = _mm_srli_si128(ba, 8); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); buffer[8] = '\0'; } }
// Convert a 32-bit unsigned integer to decimal ASCII in `buffer` (NOT
// NUL-terminated) and return the number of characters written (1..10).
// `buffer` must have room for 10 bytes. Uses the two-digit table u_ctn2s
// for short values and SSE2 (Convert8DigitsSSE2) for 9..10 digit values.
static uint32_t utoa32_sse2(uint32_t value, char* buffer) {
   char* start = buffer;

   if (value < 10000) {
      // 1..4 digits: each (n << 1) index selects a two-character pair in u_ctn2s.
      const uint32_t d1 = (value / 100) << 1;
      const uint32_t d2 = (value % 100) << 1;

      // Leading digits are emitted only when present (no zero padding).
      if (value >= 1000) *buffer++ = u_ctn2s[d1];
      if (value >= 100) *buffer++ = u_ctn2s[d1+1];
      if (value >= 10) *buffer++ = u_ctn2s[d2];
                       *buffer++ = u_ctn2s[d2+1];

      return (buffer - start);
   }

   if (value < 100000000) {
      // 5..8 digits.
      // Experiment shows that in this case SSE2 is slower
# if 0
      const __m128i a = Convert8DigitsSSE2(value);

      // Convert to bytes, add '0'
      const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

      // Count number of digit
      const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
      unsigned long digit;
# ifdef _MSC_VER
      _BitScanForward(&digit, ~mask | 0x8000);
# else
      digit = __builtin_ctz(~mask | 0x8000);
# endif
      // Shift digits to the beginning
      __m128i result = ShiftDigits_SSE2(va, digit);
      //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8));
      _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);

      return (buffer + 8 - digit - start);
# else
      // value = bbbbcccc in decimal.
      const uint32_t b = value / 10000;
      const uint32_t c = value % 10000;

      const uint32_t d1 = (b / 100) << 1;
      const uint32_t d2 = (b % 100) << 1;
      // NOTE: d3/d4 are plain 0..99 values (not doubled like d1/d2) because
      // they are expanded via U_NUM2STR16 rather than indexed into u_ctn2s.
      const uint32_t d3 = (c / 100);
      const uint32_t d4 = (c % 100);

      if (value >= 10000000) *buffer++ = u_ctn2s[d1];
      if (value >= 1000000) *buffer++ = u_ctn2s[d1+1];
      if (value >= 100000) *buffer++ = u_ctn2s[d2];
                           *buffer++ = u_ctn2s[d2+1];

      // U_NUM2STR16 presumably writes the two ASCII digits of a 0..99 value
      // in a single 16-bit store -- TODO confirm against its definition.
      U_NUM2STR16(buffer, d3);
      U_NUM2STR16(buffer+2, d4);

      return (buffer + 4 - start);
# endif
   }

   // 9..10 digits: value = aabbbbbbbb in decimal
   const uint32_t a = value / 100000000; // 1 to 42
   value %= 100000000;

   if (a < 10) *buffer++ = '0' + (char)a;
   else {
      U_NUM2STR16(buffer, a);
      buffer += 2;
   }

   // Convert the remaining eight digits with SSE2: pack the eight 16-bit
   // digits into the upper bytes, add ASCII '0', shift them into the low
   // half and store with one 8-byte write.
   const __m128i b = Convert8DigitsSSE2(value);
   const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
   const __m128i result = _mm_srli_si128(ba, 8);
   _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);

   return (buffer + 8 - start);
}
// Write the decimal representation of a 64-bit unsigned integer into
// `buffer`, NUL-terminated. Three ranges are handled: 1..8 digits (scalar
// table path), 9..16 digits (two SSE2 8-digit conversions, leading zeros
// trimmed via movemask + ctz), and 17..20 digits (scalar head + 16-digit
// SSE2 tail). `buffer` must have room for 21 bytes.
inline void u64toa_sse2(uint64_t value, char* buffer) {
    if (value < 100000000) {
        // Fits in 8 digits; work in 32 bits from here on.
        uint32_t v = static_cast<uint32_t>(value);
        if (v < 10000) {
            // 1..4 digits: each (n << 1) index selects a digit pair in gDigitsLut.
            const uint32_t d1 = (v / 100) << 1;
            const uint32_t d2 = (v % 100) << 1;

            // Leading digits are emitted only when present (no zero padding).
            if (v >= 1000) *buffer++ = gDigitsLut[d1];
            if (v >= 100) *buffer++ = gDigitsLut[d1 + 1];
            if (v >= 10) *buffer++ = gDigitsLut[d2];
                         *buffer++ = gDigitsLut[d2 + 1];
            *buffer++ = '\0';
        }
        else {
            // 5..8 digits.
            // Experiment shows that this case SSE2 is slower
#if 0
            const __m128i a = Convert8DigitsSSE2(v);

            // Convert to bytes, add '0'
            const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

            // Count number of digit
            const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
            unsigned long digit;
#ifdef _MSC_VER
            _BitScanForward(&digit, ~mask | 0x8000);
#else
            digit = __builtin_ctz(~mask | 0x8000);
#endif
            // Shift digits to the beginning
            __m128i result = ShiftDigits_SSE2(va, digit);
            _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);
            buffer[8 - digit] = '\0';
#else
            // value = bbbbcccc in decimal.
            const uint32_t b = v / 10000;
            const uint32_t c = v % 10000;

            const uint32_t d1 = (b / 100) << 1;
            const uint32_t d2 = (b % 100) << 1;
            const uint32_t d3 = (c / 100) << 1;
            const uint32_t d4 = (c % 100) << 1;

            // value == v in this branch, so testing `value` is equivalent.
            if (value >= 10000000) *buffer++ = gDigitsLut[d1];
            if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1];
            if (value >= 100000) *buffer++ = gDigitsLut[d2];
                                 *buffer++ = gDigitsLut[d2 + 1];
            *buffer++ = gDigitsLut[d3];
            *buffer++ = gDigitsLut[d3 + 1];
            *buffer++ = gDigitsLut[d4];
            *buffer++ = gDigitsLut[d4 + 1];
            *buffer++ = '\0';
#endif
        }
    }
    else if (value < 10000000000000000) {
        // 9..16 digits: split into two 8-digit halves and convert both with SSE2.
        const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
        const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

        const __m128i a0 = Convert8DigitsSSE2(v0);
        const __m128i a1 = Convert8DigitsSSE2(v1);

        // Convert to bytes, add '0'
        const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

        // Count number of digit: bytes equal to '0' mark leading zeros; the
        // first non-'0' byte is found with ctz (0x8000 guards an all-zero mask).
        const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
        unsigned long digit;
#ifdef _MSC_VER
        _BitScanForward(&digit, ~mask | 0x8000);
#else
        digit = __builtin_ctz(~mask | 0x8000);
#endif
        // Shift digits to the beginning
        __m128i result = ShiftDigits_SSE2(va, digit);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
        buffer[16 - digit] = '\0';
    }
    else {
        // 17..20 digits: value = aa..a (1..4 digits) followed by 16 digits.
        const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
        value %= 10000000000000000;

        // Emit the 1..4 digit head with the lookup table (no zero padding).
        if (a < 10) *buffer++ = '0' + static_cast<char>(a);
        else if (a < 100) {
            const uint32_t i = a << 1;
            *buffer++ = gDigitsLut[i];
            *buffer++ = gDigitsLut[i + 1];
        }
        else if (a < 1000) {
            *buffer++ = '0' + static_cast<char>(a / 100);
            const uint32_t i = (a % 100) << 1;
            *buffer++ = gDigitsLut[i];
            *buffer++ = gDigitsLut[i + 1];
        }
        else {
            const uint32_t i = (a / 100) << 1;
            const uint32_t j = (a % 100) << 1;
            *buffer++ = gDigitsLut[i];
            *buffer++ = gDigitsLut[i + 1];
            *buffer++ = gDigitsLut[j];
            *buffer++ = gDigitsLut[j + 1];
        }

        // The remaining 16 digits need no zero trimming: convert both
        // 8-digit halves and store all 16 characters at once.
        const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
        const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

        const __m128i a0 = Convert8DigitsSSE2(v0);
        const __m128i a1 = Convert8DigitsSSE2(v1);

        // Convert to bytes, add '0'
        const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va);
        buffer[16] = '\0';
    }
}
// Convert a 64-bit unsigned integer to decimal ASCII in `buffer` (NOT
// NUL-terminated) and return the number of characters written (1..20).
// `buffer` must have room for 20 bytes. Mirrors u64toa_sse2 but uses the
// u_ctn2s table / U_NUM2STR16 macro and returns the length instead of
// writing a terminator.
static uint32_t utoa64_sse2(uint64_t value, char* buffer) {
   char* start = buffer;

   if (value < 100000000) {
      // Fits in 8 digits; work in 32 bits from here on.
      uint32_t v = static_cast<uint32_t>(value);
      if (v < 10000) {
         // 1..4 digits: each (n << 1) index selects a digit pair in u_ctn2s.
         const uint32_t d1 = (v / 100) << 1;
         const uint32_t d2 = (v % 100) << 1;

         // Leading digits are emitted only when present (no zero padding).
         if (v >= 1000) *buffer++ = u_ctn2s[d1];
         if (v >= 100) *buffer++ = u_ctn2s[d1+1];
         if (v >= 10) *buffer++ = u_ctn2s[d2];
                      *buffer++ = u_ctn2s[d2+1];

         return (buffer - start);
      }

      // 5..8 digits.
      // Experiment shows that in this case SSE2 is slower
# if 0
      const __m128i a = Convert8DigitsSSE2(v);

      // Convert to bytes, add '0'
      const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

      // Count number of digit
      const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
      unsigned long digit;
# ifdef _MSC_VER
      _BitScanForward(&digit, ~mask | 0x8000);
# else
      digit = __builtin_ctz(~mask | 0x8000);
# endif
      // Shift digits to the beginning
      __m128i result = ShiftDigits_SSE2(va, digit);
      _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result);

      return (buffer + 8 - digit - start);
# else
      // value = bbbbcccc in decimal.
      const uint32_t b = v / 10000;
      const uint32_t c = v % 10000;

      const uint32_t d1 = (b / 100) << 1;
      const uint32_t d2 = (b % 100) << 1;
      // NOTE: d3/d4 are plain 0..99 values (not doubled like d1/d2) because
      // they are expanded via U_NUM2STR16 rather than indexed into u_ctn2s.
      const uint32_t d3 = (c / 100);
      const uint32_t d4 = (c % 100);

      // value == v in this branch, so testing `value` is equivalent.
      if (value >= 10000000) *buffer++ = u_ctn2s[d1];
      if (value >= 1000000) *buffer++ = u_ctn2s[d1+1];
      if (value >= 100000) *buffer++ = u_ctn2s[d2];
                           *buffer++ = u_ctn2s[d2+1];

      // U_NUM2STR16 presumably writes the two ASCII digits of a 0..99 value
      // in a single 16-bit store -- TODO confirm against its definition.
      U_NUM2STR16(buffer, d3);
      U_NUM2STR16(buffer+2, d4);

      return (buffer + 4 - start);
# endif
   }

   if (value < 10000000000000000) {
      // 9..16 digits: split into two 8-digit halves and convert both with SSE2.
      const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
      const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

      const __m128i a0 = Convert8DigitsSSE2(v0);
      const __m128i a1 = Convert8DigitsSSE2(v1);

      // Convert to bytes, add '0'
      const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);

      // Count number of digit: bytes equal to '0' mark leading zeros; the
      // first non-'0' byte is found with ctz (0x8000 guards an all-zero mask).
      const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0]));
# ifdef _MSC_VER
      unsigned long digit;
      _BitScanForward(&digit, ~mask | 0x8000);
# else
      unsigned digit = __builtin_ctz(~mask | 0x8000);
# endif
      // Shift digits to the beginning
      __m128i result = ShiftDigits_SSE2(va, digit);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

      return (buffer + 16 - digit - start);
   }

   // 17..20 digits: value = aa..a (1..4 digits) followed by 16 digits.
   const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844
   value %= 10000000000000000;

   // Emit the 1..4 digit head (no zero padding).
   if (a < 10) *buffer++ = '0' + (char)a;
   else if (a < 100) {
      U_NUM2STR16(buffer, a);
      buffer += 2;
   }
   else if (a < 1000) {
      *buffer++ = '0' + static_cast<char>(a / 100);
      const uint32_t i = (a % 100);
      U_NUM2STR16(buffer, i);
      buffer += 2;
   }
   else {
      const uint32_t i = (a / 100);
      const uint32_t j = (a % 100);
      U_NUM2STR16(buffer, i);
      U_NUM2STR16(buffer+2, j);
      buffer += 4;
   }

   // The remaining 16 digits need no zero trimming: convert both 8-digit
   // halves and store all 16 characters at once.
   const uint32_t v0 = static_cast<uint32_t>(value / 100000000);
   const uint32_t v1 = static_cast<uint32_t>(value % 100000000);

   const __m128i a0 = Convert8DigitsSSE2(v0);
   const __m128i a1 = Convert8DigitsSSE2(v1);

   // Convert to bytes, add '0'
   const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]);
   _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va);

   return (buffer + 16 - start);
}