static void TestAppend() { static const UChar32 codePoints[]={ 0x61, 0xdf, 0x901, 0x3040, 0xac00, 0xd800, 0xdbff, 0xdcde, 0xdffd, 0xe000, 0xffff, 0x10000, 0x12345, 0xe0021, 0x10ffff, 0x110000, 0x234567, 0x7fffffff, -1, -1000, 0, 0x400 }; static const uint8_t expectUnsafe[]={ 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, 0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e, 0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ /* none from this line */ 0, 0xd0, 0x80 }, expectSafe[]={ 0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80, 0xea, 0xb0, 0x80, /* no surrogates */ /* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */ /* none from this line */ 0, 0xd0, 0x80 }; uint8_t buffer[100]; UChar32 c; int32_t i, length; UBool isError, expectIsError, wrongIsError; length=0; for(i=0; i<LENGTHOF(codePoints); ++i) { c=codePoints[i]; if(c<0 || 0x10ffff<c) { continue; /* skip non-code points for U8_APPEND_UNSAFE */ } U8_APPEND_UNSAFE(buffer, length, c); } if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) { log_err("U8_APPEND_UNSAFE did not generate the expected output\n"); } length=0; wrongIsError=FALSE; for(i=0; i<LENGTHOF(codePoints); ++i) { c=codePoints[i]; expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c); isError=FALSE; U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError); wrongIsError|= isError!=expectIsError; } if(wrongIsError) { log_err("U8_APPEND did not set isError correctly\n"); } if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) { log_err("U8_APPEND did not generate the expected output\n"); } }
/**
 * Writes the UTF-8 encoding of code point c at *out and advances *out past
 * the bytes that were written.
 *
 * The encoding is done with U8_APPEND_UNSAFE; this function itself performs
 * no validation of c and no check of the space available at *out.
 */
void
BUnicodeChar::ToUTF8(uint32 c, char **out)
{
	char* cursor = *out;
	int written = 0;
	U8_APPEND_UNSAFE(cursor, written, c);
	*out = cursor + written;
}
/**
 * IntlChar::foldCase: applies u_foldCase to the code point taken from arg.
 *
 * When arg is a string the folded code point is returned as a UTF-8 string;
 * otherwise the folded code point itself is returned.
 */
Variant HHVM_STATIC_METHOD(IntlChar, foldCase,
                           const Variant& arg, int64_t options) {
  // GETCP is defined elsewhere; presumably it declares cp from arg — confirm.
  GETCP(arg, cp);
  auto folded = u_foldCase(cp, options);
  if (!arg.isString()) {
    return folded;
  }

  // Scratch space for the UTF-8 bytes of one code point.
  char utf8[5];
  int utf8_len = 0;
  U8_APPEND_UNSAFE(utf8, utf8_len, folded);
  return String(utf8, utf8_len, CopyString);
}
/**
 * Applies T to the code point taken from arg and returns the result in the
 * same form as the argument: a UTF-8 string when arg is a string, otherwise
 * the resulting code point.
 *
 * NOTE(review): T is not declared in this block; presumably a template
 * parameter naming the ICU mapping function — confirm.
 */
Variant uchar_method(Class* self_, const Variant& arg) {
  // GETCP is defined elsewhere; presumably it declares cp from arg — confirm.
  GETCP(arg, cp);
  auto mapped = T(cp);
  if (!arg.isString()) {
    return mapped;
  }

  // Reserve 5 bytes: room for the encoded code point plus the terminating 0.
  String result(5, ReserveString);
  auto out = result.bufferSlice().ptr;
  int out_len = 0;
  U8_APPEND_UNSAFE(out, out_len, mapped);
  out[out_len] = 0;
  result.setSize(out_len);
  return result;
}
/**
 * Lower-cases the UTF-8 text held by _str in place.
 *
 * The text is walked one code point at a time with U8_NEXT and each code
 * point is mapped with u_tolower. A code point is overwritten only when its
 * lower-case form occupies exactly the same number of UTF-8 bytes, so the
 * buffer length never changes and no byte outside the character being
 * replaced is touched.
 *
 * Left untouched:
 *  - ill-formed byte sequences (U8_NEXT reports them as a negative value,
 *    which must not be passed to U8_APPEND_UNSAFE);
 *  - code points whose lower-case form has a different UTF-8 length
 *    (e.g. U+023A -> U+2C65 grows from 2 to 3 bytes, U+212A -> U+006B
 *    shrinks from 3 bytes to 1). Writing those in place would either
 *    overwrite input that has not been read yet / run past the end of the
 *    buffer, or leave stale bytes at the tail.
 *
 * NOTE(review): handling length-changing mappings needs a way to resize the
 * holder, which is not visible from this function — confirm whether
 * StringHolder offers one.
 */
void ICUUnicodeSupport::_toLowerCase<1>(StringHolder<1> _str)
{
    if(_str.empty())
    {
        return;
    }

    uint8_t* buf = &_str[0];
    const int32_t len = _str.length();

    int32_t ofs = 0;
    while(ofs < len)
    {
        const int32_t start = ofs;
        UChar32 c;
        U8_NEXT(buf, ofs, len, c);
        if(c < 0)
        {
            // Ill-formed sequence: keep the original bytes.
            continue;
        }

        const UChar32 lower = u_tolower(c);
        if(lower == c || U8_LENGTH(lower) != ofs - start)
        {
            // Nothing to change, or the replacement would not fit the slot.
            continue;
        }

        // Same encoded length: overwrite the character where it stands.
        int32_t writeOfs = start;
        U8_APPEND_UNSAFE(buf, writeOfs, lower);
    }
}
/**
 * Encodes `length` UTF-16 code units starting at `characters` as UTF-8 and
 * returns the bytes as a CString. The UnencodableHandling argument is unused.
 *
 * Unpaired surrogates in the input are encoded as U+FFFD (REPLACEMENT
 * CHARACTER) so that the output is always well-formed UTF-8.
 */
CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
{
    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
    if (length > numeric_limits<size_t>::max() / 3)
        CRASH();
    Vector<uint8_t> bytes(length * 3);

    size_t i = 0;
    size_t bytesWritten = 0;
    while (i < length) {
        UChar32 character;
        U16_NEXT(characters, i, length, character);
        // U16_NEXT hands back an unmatched surrogate as a surrogate code point;
        // encoding that would produce ill-formed UTF-8, so substitute U+FFFD.
        // U+FFFD is 3 bytes for one code unit, which stays within the 3x bound above.
        if (U_IS_SURROGATE(character))
            character = 0xFFFD;
        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
    }

    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
}
SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) { // Verify that the frozen set is equal to the unfrozen one. UnicodeSet set; char utf8[4]; UChar32 c; int32_t length; for(c=0; c<=0x10ffff; ++c) { if(c==0xd800) { c=0xe000; } length=0; U8_APPEND_UNSAFE(utf8, length, c); if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) { set.add(c); } } if(set!=testcase.set) { fprintf(stderr, "error: frozen set != original!\n"); } }
/**
 * Encodes `length` code units starting at `characters` as UTF-8 and returns
 * the bytes as a CString. Surrogate code points produced by U16_NEXT are
 * encoded as replacementCharacter.
 */
CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
{
    // Worst case is 3 UTF-8 bytes per UTF-16 code unit:
    //  - a BMP character is one code unit and needs at most 3 bytes (3x);
    //  - a non-BMP character is two code units and needs at most 4 bytes (2x).
    // Refuse lengths for which length * 3 would overflow size_t.
    if (length > numeric_limits<size_t>::max() / 3)
        CRASH();
    Vector<uint8_t> bytes(length * 3);

    size_t bytesWritten = 0;
    for (size_t i = 0; i < length; ) {
        UChar32 character;
        U16_NEXT(characters, i, length, character);

        // An unmatched surrogate comes out of U16_NEXT as a surrogate code point;
        // it has to be turned into U+FFFD (REPLACEMENT CHARACTER) before encoding.
        const bool isSurrogateCodePoint = character >= 0xD800 && character <= 0xDFFF;
        if (isSurrogateCodePoint)
            character = replacementCharacter;

        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
    }

    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
}