/*
 * Convert a NUL-terminated UTF-16 wide string into a string object
 * holding one code point per character.
 *
 * Each code unit is read in order (length taken from wcslen).  A unit
 * for which isLead() holds must be immediately followed by a unit for
 * which isTrail() holds; the pair is combined into a single
 * supplementary code point.  Any other unit is written through
 * unchanged.
 *
 * Returns the converted string, or the literal string "bad char" when a
 * lead unit is not followed by a trail unit (including a lead unit at
 * the very end of the input).
 */
static SgString* utf16ToUtf32(wchar_t *s)
{
  /* Folding both surrogate bases and the 0x10000 bias into a single
     constant lets a pair be decoded as (lead << 10) + trail - offset. */
  const SgChar offset = (0xd800 << 10UL) + 0xdc00 - 0x10000;
  size_t i = 0, n = wcslen(s);
  SgPort *out;
  SgStringPort tp;
  SgObject r;

  /* The code unit count is used as the initial size hint for the port. */
  out = Sg_InitStringOutputPort(&tp, (int)n);
  while (i < n) {
    SgChar c0 = s[i++];
    if (isLead(c0)) {
      SgChar c1;
      if (i < n && isTrail((c1 = s[i]))) {
        i++;
        c0 = (c0 << 10) + c1 - offset;
      } else {
        /* Release the stack-allocated port on the error path as well,
           mirroring the cleanup performed on the success path. */
        SG_CLEAN_STRING_PORT(&tp);
        return SG_MAKE_STRING("bad char");
      }
    }
    Sg_PutcUnsafe(out, c0);
  }
  r = Sg_GetStringFromStringPort(&tp);
  SG_CLEAN_STRING_PORT(&tp);
  return r;
}
/*
 * Convert the UTF-16 code units in the half-open range [s, e) into a
 * string object holding one code point per character.
 *
 * A unit for which isLead() holds must be immediately followed, still
 * inside the range, by a unit for which isTrail() holds; the pair is
 * combined into a single supplementary code point.  Any other unit is
 * written through unchanged.
 *
 * Returns the converted string, or the literal string "bad char" when a
 * lead unit is not followed by a trail unit before `e`.
 */
static SgString* utf16ToUtf32WithRegion(wchar_t *s, wchar_t *e)
{
  /* Folding both surrogate bases and the 0x10000 bias into a single
     constant lets a pair be decoded as (lead << 10) + trail - offset. */
  const SgChar offset = (0xd800 << 10UL) + 0xdc00 - 0x10000;
  SgPort *out;
  SgStringPort tp;
  SgObject r;

  /* Twice the number of code units is used as the initial size hint. */
  out = Sg_InitStringOutputPort(&tp, (int)((e - s) * 2));
  while (s < e) {
    SgChar c0 = *s++;
    if (isLead(c0)) {
      SgChar c1;
      if (s < e && isTrail((c1 = *s))) {
        s++;
        c0 = (c0 << 10) + c1 - offset;
      } else {
        /* Release the stack-allocated port on the error path as well,
           mirroring the cleanup performed on the success path. */
        SG_CLEAN_STRING_PORT(&tp);
        return SG_MAKE_STRING("bad char");
      }
    }
    Sg_PutcUnsafe(out, c0);
  }
  r = Sg_GetStringFromStringPort(&tp);
  SG_CLEAN_STRING_PORT(&tp);
  return r;
}
/*
 * Decode the next UTF-8 sequence of `string`, starting at byte *cursor.
 *
 * string      input bytes
 * stringSize  number of bytes available in `string`
 * cursor      in: offset of the sequence to decode;
 *             out: offset just past the bytes that were consumed
 * isValid     out: `yes` when a well-formed sequence was decoded,
 *             `no` when the bytes at the cursor were malformed or truncated
 * codePoint   out: decoded value, or 0 when nothing valid was decoded
 *
 * Returns `no` (leaving *cursor untouched, *isValid = yes and
 * *codePoint = 0) when the cursor is already at or beyond the end of
 * the input; otherwise returns `yes`.
 *
 * For a malformed sequence the cursor advances over the lead byte plus
 * any following bytes that were accepted before the mismatch, so at
 * least one byte is always consumed and the caller can keep iterating.
 */
rbool utf8GetNextCharacter(const byte *string, size_t stringSize,
                           size_t *cursor, rbool *isValid,
                           unsigned *codePoint)
{
    const size_t start = *cursor;
    const byte *seq;
    size_t remaining;
    size_t consumed = 1;   /* every failure path eats at least the lead */
    unsigned decoded = 0;
    rbool ok = no;
    byte lead;
    byte lowest;
    byte highest;

    *codePoint = 0;
    *isValid = yes;

    if (start >= stringSize) {
        return no;
    }

    seq = string + start;
    remaining = stringSize - start;   /* >= 1 because of the check above */
    lead = seq[0];

    if (lead < (byte)0x80) {
        /* Single-byte (ASCII) sequence. */
        decoded = seq[0];
        ok = yes;
    } else if (lead < (byte)0xC2) {
        /* 0x80..0xC1 can never start a well-formed sequence. */
    } else if (lead < (byte)0xE0) {
        /* Two-byte sequence. */
        if (remaining >= 2 && isTrail(seq[1])) {
            decoded = ((seq[0] & (byte)0x1F) << 6)
                    | (seq[1] & (byte)0x3F);
            consumed = 2;
            ok = yes;
        }
    } else if (lead < (byte)0xF0) {
        /* Three-byte sequence: the second byte's allowed range is
           narrowed for leads 0xE0 and 0xED. */
        lowest = (lead == (byte)0xE0) ? (byte)0xA0 : (byte)0x80;
        highest = (lead == (byte)0xED) ? (byte)0x9F : (byte)0xBF;

        if (remaining < 2 || seq[1] < lowest || highest < seq[1]) {
            consumed = 1;
        } else if (remaining < 3 || !isTrail(seq[2])) {
            consumed = 2;
        } else {
            /* Bit 4 of a 0xE0..0xEF lead is always clear, so the 0x1F
               mask yields the same four payload bits as 0x0F would. */
            decoded = ((seq[0] & (byte)0x1F) << 12)
                    | ((seq[1] & (byte)0x3F) << 6)
                    | (seq[2] & (byte)0x3F);
            consumed = 3;
            ok = yes;
        }
    } else if (lead < (byte)0xF5) {
        /* Four-byte sequence: the second byte's allowed range is
           narrowed for leads 0xF0 and 0xF4. */
        lowest = (lead == (byte)0xF0) ? (byte)0x90 : (byte)0x80;
        highest = (lead == (byte)0xF4) ? (byte)0x8F : (byte)0xBF;

        if (remaining < 2 || seq[1] < lowest || highest < seq[1]) {
            consumed = 1;
        } else if (remaining < 3 || !isTrail(seq[2])) {
            consumed = 2;
        } else if (remaining < 4 || !isTrail(seq[3])) {
            consumed = 3;
        } else {
            decoded = ((seq[0] & (byte)0x7) << 18)
                    | ((seq[1] & (byte)0x3F) << 12)
                    | ((seq[2] & (byte)0x3F) << 6)
                    | (seq[3] & (byte)0x3F);
            consumed = 4;
            ok = yes;
        }
    } else {
        /* 0xF5..0xFF are never valid lead bytes. */
    }

    *codePoint = decoded;
    *isValid = ok;
    *cursor = start + consumed;
    return yes;
}