/*
 * Check the single UTF-8 character that starts at s.
 *
 * s:   pointer to the first byte of the character.
 * len: number of input bytes available starting at s.
 *
 * Returns the byte length reported by pg_utf_mblen() for the character, or
 * -1 if fewer than that many bytes are available or pg_utf8_islegal()
 * rejects the sequence.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int			mblen = pg_utf_mblen(s);

	/* length test comes first so the legality check is skipped when short */
	if (mblen > len || !pg_utf8_islegal(s, mblen))
		return -1;

	return mblen;
}
Datum utf8_to_iso8859_1(PG_FUNCTION_ARGS) { unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c, c1; Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(PG_GETARG_INT32(1) == PG_LATIN1); Assert(len >= 0); while (len > 0) { c = *src; if (c == 0) report_invalid_encoding(PG_UTF8, (const char *) src, len); /* fast path for ASCII-subset characters */ if (!IS_HIGHBIT_SET(c)) { *dest++ = c; src++; len--; } else { int l = pg_utf_mblen(src); if (l > len || !pg_utf8_islegal(src, l)) report_invalid_encoding(PG_UTF8, (const char *) src, len); if (l != 2) report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); c1 = src[1] & 0x3f; c = ((c & 0x1f) << 6) | c1; if (c >= 0x80 && c <= 0xff) { *dest++ = (unsigned char) c; src += 2; len -= 2; } else report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); } } *dest = '\0'; PG_RETURN_VOID(); }
/*
 * Calculate the length in characters of a null-terminated UTF-8 string.
 *
 * source: null-terminated input string.
 *
 * Returns the number of characters, or -1 if the input is not valid UTF-8.
 * A multi-byte character that is cut short by the string terminator counts
 * as invalid.
 */
static int
pg_utf8_string_len(const char *source)
{
	const unsigned char *p = (const unsigned char *) source;
	int			num_chars = 0;

	while (*p)
	{
		int			l = pg_utf_mblen(p);
		int			i;

		/*
		 * Make sure all l bytes lie before the terminator before handing
		 * them to pg_utf8_islegal(), which is given only a length and so
		 * cannot know where the string ends.  The scan stops at the first
		 * NUL, so it never looks past the end of the string itself.
		 */
		for (i = 1; i < l; i++)
		{
			if (p[i] == '\0')
				return -1;
		}

		if (!pg_utf8_islegal(p, l))
			return -1;

		p += l;
		num_chars++;
	}

	return num_chars;
}
/* * UTF8 ---> local code * * utf: input UTF8 string (need not be null-terminated). * iso: pointer to the output area (must be large enough!) * map: the conversion map. * cmap: the conversion map for combined characters. * (optional) * size1: the size of the conversion map. * size2: the size of the conversion map for combined characters * (optional) * encoding: the PG identifier for the local encoding. * len: length of input string. */ void UtfToLocal(const unsigned char *utf, unsigned char *iso, const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, int size1, int size2, int encoding, int len) { uint32 iutf; uint32 cutf[2]; uint32 code; pg_utf_to_local *p; pg_utf_to_local_combined *cp; int l; for (; len > 0; len -= l) { /* "break" cases all represent errors */ if (*utf == '\0') break; l = pg_utf_mblen(utf); if (len < l) break; if (!pg_utf8_islegal(utf, l)) break; if (l == 1) { /* ASCII case is easy */ *iso++ = *utf++; continue; } else if (l == 2) { iutf = *utf++ << 8; iutf |= *utf++; } else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } else if (l == 4) { iutf = *utf++ << 24; iutf |= *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } /* * first, try with combined map if possible */ if (cmap && len > l) { const unsigned char *utf_save = utf; int len_save = len; int l_save = l; len -= l; l = pg_utf_mblen(utf); if (len < l) break; if (!pg_utf8_islegal(utf, l)) break; cutf[0] = iutf; if (l == 1) { if (len_save > 1) { p = bsearch(&cutf[0], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf_save - l_save), len_save); iso = set_iso_code(iso, p->code); } /* ASCII case is easy */ *iso++ = *utf++; continue; } else if (l == 2) { iutf = *utf++ << 8; iutf |= *utf++; } else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } else if (l == 4) { iutf = *utf++ << 24; iutf |= *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } cutf[1] = iutf; 
cp = bsearch(cutf, cmap, size2, sizeof(pg_utf_to_local_combined), compare3); if (cp) code = cp->code; else { /* not found in combined map. try with ordinary map */ p = bsearch(&cutf[0], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf_save - l_save), len_save); iso = set_iso_code(iso, p->code); p = bsearch(&cutf[1], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf - l), len); code = p->code; } } else /* no cmap or no remaining data */ { p = bsearch(&iutf, map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf - l), len); code = p->code; } iso = set_iso_code(iso, code); } if (len > 0) report_invalid_encoding(PG_UTF8, (const char *) utf, len); *iso = '\0'; }
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * An invalid encoding number raises an error before any conversion is done.
 * Invalid input (a zero byte, a character longer than the remaining input,
 * or a sequence rejected by pg_utf8_islegal()) is reported through
 * report_invalid_encoding().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
		   unsigned char *iso,
		   const pg_utf_to_local *map, int mapsize,
		   const pg_utf_to_local_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iutf;
	int			l;
	const pg_utf_to_local *p;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/*
		 * collect coded char of length l
		 *
		 * The casts keep the shifts in unsigned arithmetic: a byte >= 0x80
		 * promoted to int and shifted left by 24 would overflow a 32-bit
		 * int, which is undefined behavior.
		 */
		if (l == 2)
		{
			iutf = (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = (uint32) *utf++ << 16;
			iutf |= (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = (uint32) *utf++ << 24;
			iutf |= (uint32) *utf++ << 16;
			iutf |= (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			/* state to restore if the pair lookup does not succeed */
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32		iutf2;
				uint32		cutf[2];

				if (l == 2)
				{
					iutf2 = (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = (uint32) *utf++ << 24;
					iutf2 |= (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					/*
					 * Both characters are consumed; the loop's "len -= l"
					 * accounts for the second one, the first was subtracted
					 * above.
					 */
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		p = bsearch(&iutf, map, mapsize,
					sizeof(pg_utf_to_local), compare1);

		if (p)
		{
			iso = store_coded_char(iso, p->code);
			continue;
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iutf);

			/* a zero result is treated as "no conversion available" */
			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}
/* * Verify mbstr to make sure that it has a valid character sequence. * mbstr is not necessarily NULL terminated; length of mbstr is * specified by len. * * If OK, return TRUE. If a problem is found, return FALSE when noError is * true; when noError is false, ereport() a descriptive message. */ bool pg_verifymbstr(const char *mbstr, int len, bool noError) { int l; int i; int encoding; /* we do not need any check in single-byte encodings */ if (pg_database_encoding_max_length() <= 1) return true; encoding = GetDatabaseEncoding(); while (len > 0 && *mbstr) { l = pg_mblen(mbstr); /* special UTF-8 check */ if (encoding == PG_UTF8) { if (!pg_utf8_islegal((const unsigned char *) mbstr, l)) { if (noError) return false; ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid UTF-8 byte sequence detected near byte 0x%02x", (unsigned char) *mbstr))); } } else { for (i = 1; i < l; i++) { /* * we expect that every multibyte char consists of bytes * having the 8th bit set */ if (i >= len || (mbstr[i] & 0x80) == 0) { char buf[8 * 2 + 1]; char *p = buf; int j, jlimit; if (noError) return false; jlimit = Min(l, len); jlimit = Min(jlimit, 8); /* prevent buffer overrun */ for (j = 0; j < jlimit; j++) p += sprintf(p, "%02x", (unsigned char) mbstr[j]); ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid byte sequence for encoding \"%s\": 0x%s", GetDatabaseEncodingName(), buf))); } } } len -= l; mbstr += l; } return true; }