// Validate a single UTF-8 character according to RFC 3629.
//
// The sequence length is obtained from utf8proc_charlen() (defined elsewhere
// in this file); a result of zero or below is handed back to the caller
// untouched. For a positive length, the leading bytes are inspected for
// encodings that are complete but still not allowed.
//
// Returns the length when the character is accepted, or the negated length
// when it is rejected.
static int utf8proc_valid(const uint8_t *str, int str_len)
{
    const int nbytes = utf8proc_charlen(str, str_len);
    int rejected = 0;

    if (nbytes <= 0)
        return nbytes;

    if (nbytes == 1) {
        // ASCII NUL is technically valid but rejected for security reasons.
        rejected = (str[0] == 0x00);
    } else if (nbytes == 2) {
        // Lead bytes below 0xC2 can only produce overlong encodings.
        rejected = (str[0] < 0xC2);
    } else if (nbytes == 3) {
        // 0xE0 followed by a byte below 0xA0 is overlong;
        // 0xED followed by 0xA0 or above lands in the surrogate range.
        rejected = (str[0] == 0xE0 && str[1] < 0xA0) ||
                   (str[0] == 0xED && str[1] >= 0xA0);
    } else if (nbytes == 4) {
        // 0xF0 followed by a byte below 0x90 is overlong; a lead byte above
        // 0xF4, or 0xF4 followed by 0x90 or above, exceeds 0x10FFFF.
        rejected = (str[0] == 0xF0 && str[1] < 0x90) ||
                   (str[0] > 0xF4) ||
                   (str[0] == 0xF4 && str[1] >= 0x90);
    }

    return rejected ? -nbytes : nbytes;
}
// Append `size` bytes of `line` to `ob`, expanding tabs to spaces and
// replacing input that fails UTF-8 validation.
//
//  - Runs of non-NUL ASCII bytes (below 0x80, excluding '\t') are copied
//    verbatim; each byte advances the column counter by one.
//  - Every '\t' becomes 1 to 4 spaces, enough to bring the column counter to
//    the next multiple of 4.
//  - Any other byte (NUL, or 0x80 and above) is passed to utf8proc_valid().
//    A positive result copies that many bytes; otherwise encode_unknown(ob)
//    is called (presumably it emits a replacement marker; it is defined
//    elsewhere) and the rejected bytes are skipped. Either way the column
//    counter advances by one.
void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)
{
    enum { TAB_STOP = 4 };
    // Must contain at least TAB_STOP spaces: up to TAB_STOP bytes are read
    // from this buffer for a single tab.
    static const uint8_t whitespace[] = "    ";

    size_t i = 0, tab = 0;

    while (i < size) {
        size_t org = i;

        // Fast path: plain ASCII. 0x80 is a continuation byte, not ASCII, so
        // the bound is strict. NUL is excluded so that it reaches the
        // validator, which rejects it.
        while (i < size && line[i] != '\t' && line[i] != '\0' &&
               line[i] < 0x80) {
            i++;
            tab++;
        }

        if (i > org)
            strbuf_put(ob, line + org, i - org);

        if (i >= size)
            break;

        if (line[i] == '\t') {
            int numspaces = TAB_STOP - (int)(tab % TAB_STOP);

            strbuf_put(ob, whitespace, numspaces);
            i += 1;
            tab += numspaces;
        } else {
            // Full RFC 3629 validation, not just a structural length check,
            // so overlong forms, surrogates and out-of-range values are
            // replaced instead of being copied through.
            int charlen = utf8proc_valid(line + i, size - i);

            if (charlen > 0) {
                strbuf_put(ob, line + i, charlen);
            } else {
                encode_unknown(ob);
                // A negative result is the number of bytes to skip. Treat an
                // unexpected zero as one byte so the loop always advances.
                charlen = (charlen < 0) ? -charlen : 1;
            }

            i += charlen;
            tab += 1;
        }
    }
}
// Decode one UTF-8 character starting at `str` into a code point.
//
// `*dst` is set to -1 up front and is overwritten only on success. The
// sequence length comes from utf8proc_charlen() (defined elsewhere in this
// file). The decoded value is rejected when it is an overlong encoding, a
// surrogate (0xD800..0xDFFF), in 0xFDD0..0xFDEF, at or above 0x110000, or has
// its low 16 bits equal to 0xFFFE or 0xFFFF.
//
// Returns the sequence length on success and -1 on any failure, including a
// length outside 1..4.
int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst)
{
    int32_t cp;
    int nbytes;

    *dst = -1;

    nbytes = utf8proc_charlen(str, str_len);
    if (nbytes < 0)
        return -1;

    if (nbytes == 1) {
        cp = str[0];
    } else if (nbytes == 2) {
        cp = ((str[0] & 0x1F) << 6) | (str[1] & 0x3F);
        if (cp < 0x80)
            return -1;
    } else if (nbytes == 3) {
        cp = ((str[0] & 0x0F) << 12) | ((str[1] & 0x3F) << 6) |
             (str[2] & 0x3F);
        if (cp < 0x800)
            return -1;
        if (cp >= 0xD800 && cp <= 0xDFFF)
            return -1;
        if (cp >= 0xFDD0 && cp <= 0xFDEF)
            return -1;
    } else if (nbytes == 4) {
        cp = ((str[0] & 0x07) << 18) | ((str[1] & 0x3F) << 12) |
             ((str[2] & 0x3F) << 6) | (str[3] & 0x3F);
        if (cp < 0x10000 || cp > 0x10FFFF)
            return -1;
    } else {
        // Any other length (including zero) has no decoding.
        return -1;
    }

    // Values whose low 16 bits are 0xFFFE or 0xFFFF are rejected in every plane.
    if ((cp & 0xFFFF) >= 0xFFFE)
        return -1;

    *dst = cp;
    return nbytes;
}