/*
 * EUC_KR ---> MIC
 *
 * Converts len bytes of EUC_KR text at euc into mule internal code at p.
 * ASCII bytes are copied unchanged; each two-byte EUC_KR character is
 * written as LC_KS5601 followed by its two original bytes.  The output is
 * null-terminated.  An embedded NUL byte, or a high-bit sequence that
 * pg_encoding_verifymb() does not accept as exactly two bytes long, is
 * handed to report_invalid_encoding().
 */
static void
euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
{
	while (len > 0)
	{
		int			ch = *euc;
		int			mblen;

		if (!IS_HIGHBIT_SET(ch))
		{
			/* should be ASCII; a zero byte is never acceptable */
			if (ch == 0)
				report_invalid_encoding(PG_EUC_KR, (const char *) euc, len);
			*p++ = ch;
			euc++;
			len--;
			continue;
		}

		/* multibyte character: must verify as exactly two bytes */
		mblen = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
		if (mblen != 2)
			report_invalid_encoding(PG_EUC_KR, (const char *) euc, len);

		p[0] = LC_KS5601;
		p[1] = ch;
		p[2] = euc[1];
		p += 3;

		euc += 2;
		len -= 2;
	}

	*p = '\0';
}
/*
 * EUC_CN ---> MIC
 *
 * Converts len bytes of EUC_CN text at euc into mule internal code at p.
 * ASCII bytes are copied unchanged; each two-byte EUC_CN character is
 * written as LC_GB2312_80 followed by its two original bytes.  The output
 * is null-terminated.  An embedded NUL byte, a truncated character, or a
 * trail byte without its high bit set is handed to
 * report_invalid_encoding().
 */
static void
euc_cn2mic(const unsigned char *euc, unsigned char *p, int len)
{
	while (len > 0)
	{
		int			ch = *euc;

		if (!IS_HIGHBIT_SET(ch))
		{
			/* should be ASCII; a zero byte is never acceptable */
			if (ch == 0)
				report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
			*p++ = ch;
			euc++;
			len--;
			continue;
		}

		/* lead byte needs a high-bit trail byte that is still inside the input */
		if (!(len >= 2 && IS_HIGHBIT_SET(euc[1])))
			report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);

		p[0] = LC_GB2312_80;
		p[1] = ch;
		p[2] = euc[1];
		p += 3;

		euc += 2;
		len -= 2;
	}

	*p = '\0';
}
/*
 * MIC ---> EUC_CN
 *
 * Converts len bytes of mule internal code at mic into EUC_CN at p.
 * ASCII bytes are copied unchanged.  A high-bit lead byte other than
 * LC_GB2312_80 is handed to report_untranslatable_char(); a character that
 * is truncated or whose two payload bytes lack the high bit is handed to
 * report_invalid_encoding().  For a valid character the charset lead byte
 * is dropped and the two payload bytes are emitted.  The output is
 * null-terminated.
 */
static void
mic2euc_cn(const unsigned char *mic, unsigned char *p, int len)
{
	while (len > 0)
	{
		int			ch = *mic;

		if (!IS_HIGHBIT_SET(ch))
		{
			/* should be ASCII; a zero byte is never acceptable */
			if (ch == 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			*p++ = ch;
			mic++;
			len--;
			continue;
		}

		/* only the GB2312 charset can be represented in EUC_CN */
		if (ch != LC_GB2312_80)
			report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN,
									   (const char *) mic, len);

		/* need the lead byte plus two high-bit payload bytes */
		if (!(len >= 3 && IS_HIGHBIT_SET(mic[1]) && IS_HIGHBIT_SET(mic[2])))
			report_invalid_encoding(PG_MULE_INTERNAL,
									(const char *) mic, len);

		*p++ = mic[1];
		*p++ = mic[2];
		mic += 3;
		len -= 3;
	}

	*p = '\0';
}
/*
 * MIC ---> LATINn when the charset's local codes map directly to MIC
 *
 * mic points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * ASCII bytes are copied unchanged.  A two-byte MIC character whose lead
 * byte equals lc and whose second byte has the high bit set is emitted as
 * that second byte alone.  Anything else is handed to
 * report_untranslatable_char(), or to report_invalid_encoding() for a NUL
 * byte or a truncated character.  The output is null-terminated.
 */
void
mic2latin(const unsigned char *mic, unsigned char *p, int len,
		  int lc, int encoding)
{
	while (len > 0)
	{
		int			ch = *mic;
		int			mblen;

		if (ch == 0)
			report_invalid_encoding(PG_MULE_INTERNAL,
									(const char *) mic, len);

		if (!IS_HIGHBIT_SET(ch))
		{
			/* easy for ASCII */
			*p++ = ch;
			mic++;
			len--;
			continue;
		}

		/* the whole character must be present in the input */
		mblen = pg_mic_mblen(mic);
		if (len < mblen)
			report_invalid_encoding(PG_MULE_INTERNAL,
									(const char *) mic, len);

		/* only a two-byte character of charset lc is translatable */
		if (!(mblen == 2 && ch == lc && IS_HIGHBIT_SET(mic[1])))
			report_untranslatable_char(PG_MULE_INTERNAL, encoding,
									   (const char *) mic, len);

		*p++ = mic[1];
		mic += 2;
		len -= 2;
	}

	*p = '\0';
}
/*
 * Check that mbstr is validly encoded in the given encoding.
 *
 * mbstr need not be zero terminated; its length in bytes is given by len.
 *
 * Returns true when the string is valid.  When a problem is found and
 * noError is true, returns false; when noError is false,
 * report_invalid_encoding() is called with the offending position.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	mbverifier	verifier;

	Assert(PG_VALID_ENCODING(encoding));

	/*
	 * For single-byte encodings the only thing to reject is an embedded
	 * null (\0), so a single memchr() scan is enough.
	 */
	if (pg_encoding_max_length(encoding) <= 1)
	{
		const char *nul = memchr(mbstr, 0, len);

		if (nul == NULL)
			return true;
		if (noError)
			return false;
		report_invalid_encoding(encoding, nul, 1);
	}

	/* look up the per-encoding verifier once, outside the loop */
	verifier = pg_wchar_table[encoding].mbverify;

	while (len > 0)
	{
		int			charlen;

		if (!IS_HIGHBIT_SET(*mbstr))
		{
			if (*mbstr == '\0')
			{
				/* embedded null is always invalid */
				if (noError)
					return false;
				report_invalid_encoding(encoding, mbstr, len);
			}
			else
			{
				/* fast path for ASCII-subset characters */
				mbstr++;
				len--;
				continue;
			}
		}

		charlen = verifier((const unsigned char *) mbstr, len);
		if (charlen < 0)
		{
			if (noError)
				return false;
			report_invalid_encoding(encoding, mbstr, len);
		}

		mbstr += charlen;
		len -= charlen;
	}

	return true;
}
/* * EUC_TW ---> MIC */ static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) { int c1; int l; while (len > 0) { c1 = *euc; if (IS_HIGHBIT_SET(c1)) { l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len); if (l < 0) report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); if (c1 == SS2) { c1 = euc[1]; /* plane No. */ if (c1 == 0xa1) *p++ = LC_CNS11643_1; else if (c1 == 0xa2) *p++ = LC_CNS11643_2; else { /* other planes are MULE private charsets */ *p++ = LCPRV2_B; *p++ = c1 - 0xa3 + LC_CNS11643_3; } *p++ = euc[2]; *p++ = euc[3]; } else { /* CNS11643-1 */ *p++ = LC_CNS11643_1; *p++ = c1; *p++ = euc[1]; } euc += l; len -= l; } else { /* should be ASCII */ if (c1 == 0) report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); *p++ = c1; euc++; len--; } } *p = '\0'; }
/* * MIC ---> EUC_TW */ static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) { int c1; int l; while (len > 0) { c1 = *mic; if (!IS_HIGHBIT_SET(c1)) { /* ASCII */ if (c1 == 0) report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); *p++ = c1; mic++; len--; continue; } l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); if (c1 == LC_CNS11643_1) { *p++ = mic[1]; *p++ = mic[2]; } else if (c1 == LC_CNS11643_2) { *p++ = SS2; *p++ = 0xa2; *p++ = mic[1]; *p++ = mic[2]; } else if (c1 == 0x9d && mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7) { /* LCPRV2? */ *p++ = SS2; *p++ = mic[1] - LC_CNS11643_3 + 0xa3; *p++ = mic[2]; *p++ = mic[3]; } else report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW, (const char *) mic, len); mic += l; len -= l; } *p = '\0'; }
Datum utf8_to_iso8859_1(PG_FUNCTION_ARGS) { unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c, c1; Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(PG_GETARG_INT32(1) == PG_LATIN1); Assert(len >= 0); while (len > 0) { c = *src; if (c == 0) report_invalid_encoding(PG_UTF8, (const char *) src, len); /* fast path for ASCII-subset characters */ if (!IS_HIGHBIT_SET(c)) { *dest++ = c; src++; len--; } else { int l = pg_utf_mblen(src); if (l > len || !pg_utf8_islegal(src, l)) report_invalid_encoding(PG_UTF8, (const char *) src, len); if (l != 2) report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); c1 = src[1] & 0x3f; c = ((c & 0x1f) << 6) | c1; if (c >= 0x80 && c <= 0xff) { *dest++ = (unsigned char) c; src += 2; len -= 2; } else report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); } } *dest = '\0'; PG_RETURN_VOID(); }
/*
 * latin2mic_with_table: a generic single byte charset encoding
 * conversion from a local charset to the mule internal code.
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the local charset
 * starting from 128 (0x80). each entry in the table
 * holds the corresponding code point for the mule internal code.
 *
 * ASCII bytes are copied unchanged.  A high-bit byte is looked up in tab;
 * a nonzero entry is emitted as lc followed by that entry, a zero entry is
 * handed to report_untranslatable_char().  NUL bytes are handed to
 * report_invalid_encoding().  The output is null-terminated.
 */
void
latin2mic_with_table(const unsigned char *l,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab)
{
	for (; len > 0; l++, len--)
	{
		unsigned char ch = *l;
		unsigned char mapped;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		if (!IS_HIGHBIT_SET(ch))
		{
			*p++ = ch;
			continue;
		}

		/* table is indexed from 0x80 */
		mapped = tab[ch - HIGHBIT];
		if (mapped == 0)
		{
			report_untranslatable_char(encoding, PG_MULE_INTERNAL,
									   (const char *) l, len);
			continue;
		}

		*p++ = lc;
		*p++ = mapped;
	}

	*p = '\0';
}
/*
 * local2local: a generic single byte charset encoding
 * conversion between two ASCII-superset encodings.
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * src_encoding is the PG identifier for the source encoding
 * dest_encoding is the PG identifier for the target encoding
 * tab holds conversion entries for the source charset
 * starting from 128 (0x80). each entry in the table holds the corresponding
 * code point for the target charset, or 0 if there is no equivalent code.
 *
 * ASCII bytes are copied unchanged.  NUL bytes are handed to
 * report_invalid_encoding(); high-bit bytes with a zero table entry are
 * handed to report_untranslatable_char().  The output is null-terminated.
 */
void
local2local(const unsigned char *l,
			unsigned char *p,
			int len,
			int src_encoding,
			int dest_encoding,
			const unsigned char *tab)
{
	for (; len > 0; l++, len--)
	{
		unsigned char ch = *l;
		unsigned char mapped;

		if (ch == 0)
			report_invalid_encoding(src_encoding, (const char *) l, len);

		if (!IS_HIGHBIT_SET(ch))
		{
			*p++ = ch;
			continue;
		}

		/* table is indexed from 0x80 */
		mapped = tab[ch - HIGHBIT];
		if (mapped == 0)
			report_untranslatable_char(src_encoding, dest_encoding,
									   (const char *) l, len);
		else
			*p++ = mapped;
	}

	*p = '\0';
}
Datum iso8859_1_to_utf8(PG_FUNCTION_ARGS) { unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c; Assert(PG_GETARG_INT32(0) == PG_LATIN1); Assert(PG_GETARG_INT32(1) == PG_UTF8); Assert(len >= 0); while (len > 0) { c = *src; if (c == 0) report_invalid_encoding(PG_LATIN1, (const char *) src, len); if (!IS_HIGHBIT_SET(c)) *dest++ = c; else { *dest++ = (c >> 6) | 0xc0; *dest++ = (c & 0x003f) | HIGHBIT; } src++; len--; } *dest = '\0'; PG_RETURN_VOID(); }
/* * Big5 ---> MIC */ static void big52mic(const unsigned char *big5, unsigned char *p, int len) { unsigned short c1; unsigned short big5buf, cnsBuf; unsigned char lc; int l; while (len > 0) { c1 = *big5; if (!IS_HIGHBIT_SET(c1)) { /* ASCII */ if (c1 == 0) report_invalid_encoding(PG_BIG5, (const char *) big5, len); *p++ = c1; big5++; len--; continue; } l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len); if (l < 0) report_invalid_encoding(PG_BIG5, (const char *) big5, len); big5buf = (c1 << 8) | big5[1]; cnsBuf = BIG5toCNS(big5buf, &lc); if (lc != 0) { if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4) { *p++ = 0x9d; /* LCPRV2 */ } *p++ = lc; /* Plane No. */ *p++ = (cnsBuf >> 8) & 0x00ff; *p++ = cnsBuf & 0x00ff; } else report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL, (const char *) big5, len); big5 += l; len -= l; }
/* * mic2latin_with_table: a generic single byte charset encoding * conversion from the mule internal code to a local charset. * * mic points to the source string of length len * p is the output area (must be large enough!) * lc is the mule character set id for the local encoding * encoding is the PG identifier for the local encoding * tab holds conversion entries for the mule internal code's * second byte, starting from 128 (0x80). each entry in the table * holds the corresponding code point for the local charset. */ void mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, const unsigned char *tab) { unsigned char c1, c2; while (len > 0) { c1 = *mic; if (c1 == 0) report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); if (!IS_HIGHBIT_SET(c1)) { /* easy for ASCII */ *p++ = c1; mic++; len--; } else { int l = pg_mic_mblen(mic); if (len < l) report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || (c2 = tab[mic[1] - HIGHBIT]) == 0) { report_untranslatable_char(PG_MULE_INTERNAL, encoding, (const char *) mic, len); break; /* keep compiler quiet */ } *p++ = c2; mic += 2; len -= 2; } } *p = '\0'; }
/*
 * MIC ---> EUC_KR
 *
 * Converts len bytes of mule internal code at mic into EUC_KR at p.
 * ASCII bytes are copied unchanged.  A character whose lead byte is
 * LC_KS5601 is emitted as its two payload bytes; any other charset is
 * handed to report_untranslatable_char().  NUL bytes and sequences
 * rejected by pg_encoding_verifymb() are handed to
 * report_invalid_encoding().  The output is null-terminated.
 */
static void
mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
{
	while (len > 0)
	{
		int			ch = *mic;

		if (IS_HIGHBIT_SET(ch))
		{
			int			mblen;

			mblen = pg_encoding_verifymb(PG_MULE_INTERNAL,
										 (const char *) mic, len);
			if (mblen < 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);

			if (ch != LC_KS5601)
				report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR,
										   (const char *) mic, len);
			else
			{
				/* drop the charset byte, keep the two payload bytes */
				*p++ = mic[1];
				*p++ = mic[2];
			}

			mic += mblen;
			len -= mblen;
		}
		else
		{
			/* ASCII; a zero byte is never acceptable */
			if (ch == 0)
				report_invalid_encoding(PG_MULE_INTERNAL,
										(const char *) mic, len);
			*p++ = ch;
			mic++;
			len--;
		}
	}

	*p = '\0';
}
/*
 * ASCII ---> MIC
 *
 * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
 * characters, here we must take a hard line because we don't know
 * the appropriate MIC equivalent.
 *
 * Copies len bytes from l to p and null-terminates the output.  Any NUL or
 * high-bit-set byte is handed to report_invalid_encoding().
 */
void
pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (IS_HIGHBIT_SET(ch) || ch == 0)
			report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);

		*p++ = ch;
	}

	*p = '\0';
}
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * ASCII bytes are copied unchanged; every high-bit byte is emitted
 * prefixed with lc.  NUL bytes are handed to report_invalid_encoding().
 * The output is null-terminated.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* non-ASCII bytes get the charset id in front of them */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}

	*p = '\0';
}
/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *		(output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		(optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		(optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		(optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  An error is raised if no match is found.
 *
 * An embedded NUL byte or a sequence rejected by pg_encoding_verifymb()
 * ends the loop early and is passed to report_invalid_encoding().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
LocalToUtf(const unsigned char *iso, int len,
		   unsigned char *utf,
		   const pg_local_to_utf *map, int mapsize,
		   const pg_local_to_utf_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iiso;
	int			l;
	const pg_local_to_utf *p;
	const pg_local_to_utf_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*iso == '\0')
			break;

		if (!IS_HIGHBIT_SET(*iso))
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*utf++ = *iso++;
			l = 1;
			continue;
		}

		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
		if (l < 0)
			break;

		/*
		 * Collect coded char of length l, most significant byte first.
		 *
		 * Each byte is widened to uint32 before it is shifted.  Without the
		 * cast the unsigned char is promoted to (signed) int, and since the
		 * first byte always has its high bit set here, shifting it left by
		 * 24 would overflow a 32-bit int, which is undefined behavior in C.
		 */
		if (l == 1)
			iiso = *iso++;
		else if (l == 2)
		{
			iiso = (uint32) *iso++ << 8;
			iiso |= *iso++;
		}
		else if (l == 3)
		{
			iiso = (uint32) *iso++ << 16;
			iiso |= (uint32) *iso++ << 8;
			iiso |= *iso++;
		}
		else if (l == 4)
		{
			iiso = (uint32) *iso++ << 24;
			iiso |= (uint32) *iso++ << 16;
			iiso |= (uint32) *iso++ << 8;
			iiso |= *iso++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iiso = 0;			/* keep compiler quiet */
		}

		/* First check ordinary map */
		p = bsearch(&iiso, map, mapsize,
					sizeof(pg_local_to_utf), compare2);

		if (p)
		{
			utf = store_coded_char(utf, p->utf);
			continue;
		}

		/* If there's a combined character map, try that */
		if (cmap)
		{
			cp = bsearch(&iiso, cmap, cmapsize,
						 sizeof(pg_local_to_utf_combined), compare4);

			if (cp)
			{
				/* one local character expands to two UTF8 characters */
				utf = store_coded_char(utf, cp->utf1);
				utf = store_coded_char(utf, cp->utf2);
				continue;
			}
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iiso);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}
		}

		/*
		 * failed to translate this character; iso has already been advanced
		 * past it, so back up by l to point at its first byte
		 */
		report_untranslatable_char(encoding, PG_UTF8,
								   (const char *) (iso - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(encoding, (const char *) iso, len);

	*utf = '\0';
}
/* ---------- * conv_proc( * INTEGER, -- source encoding id * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) * INTEGER -- source string length * ) returns VOID; * ---------- */ Datum sjis_eudc_to_utf8(PG_FUNCTION_ARGS) { unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); unsigned char *p; unsigned char *fallback_character = NULL; int len = PG_GETARG_INT32(4); int sjis_len; int clen; CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_UTF8); if (sjis_to_utf8 == NULL) sjis_to_utf8 = load_external_function( "utf8_and_sjis", "sjis_to_utf8", true, NULL); *dest = '\0'; p = src; sjis_len = 0; for (; len > 0; len -= clen) { const unsigned char *c = p + sjis_len; if (c[0] == '\0') report_invalid_encoding(PG_SJIS, (const char *) p + sjis_len, len); if (c[0] >= 0xf0 && c[0] <= 0xf9 && len >= 2 && ISSJISTAIL(c[1])) { int ucs; int m; int n; clen = 2; /* SJIS to UTF8 */ if (sjis_len > 0) { DirectFunctionCall5(sjis_to_utf8, PG_SJIS, PG_UTF8, CStringGetDatum(p), CStringGetDatum(dest), sjis_len); dest = dest + strlen((char *) dest); p += sjis_len; sjis_len = 0; } p += clen; elog(eudc_log_level, "eudc character found: %02x%02x in SJIS to UTF8 conversion", c[0], c[1]); /* SJIS EUDC to UTF8 */ if (eudc_fallback_character && eudc_fallback_character[0]) { /* map to fallback character */ int i; if (fallback_character == NULL) { fallback_character = pg_do_encoding_conversion( (unsigned char *) eudc_fallback_character, strlen(eudc_fallback_character), GetDatabaseEncoding(), PG_UTF8); } for (i = 0; fallback_character[i]; i++) *dest++ = fallback_character[i]; } else { /* linear mapping */ n = c[0] - 0xf0; m = c[1] - 0x40; if (m >= 0x40) m--; ucs = 0xe000 + n * 188 + m; *dest++ = (ucs >> 12) | 0xe0; *dest++ = (ucs & 0x0fc0) >> 6 | 0x80; *dest++ = (ucs & 0x003f) | 0x80; } *dest = '\0'; } else {
/* * UTF8 ---> local code * * utf: input UTF8 string (need not be null-terminated). * iso: pointer to the output area (must be large enough!) * map: the conversion map. * cmap: the conversion map for combined characters. * (optional) * size1: the size of the conversion map. * size2: the size of the conversion map for combined characters * (optional) * encoding: the PG identifier for the local encoding. * len: length of input string. */ void UtfToLocal(const unsigned char *utf, unsigned char *iso, const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, int size1, int size2, int encoding, int len) { uint32 iutf; uint32 cutf[2]; uint32 code; pg_utf_to_local *p; pg_utf_to_local_combined *cp; int l; for (; len > 0; len -= l) { /* "break" cases all represent errors */ if (*utf == '\0') break; l = pg_utf_mblen(utf); if (len < l) break; if (!pg_utf8_islegal(utf, l)) break; if (l == 1) { /* ASCII case is easy */ *iso++ = *utf++; continue; } else if (l == 2) { iutf = *utf++ << 8; iutf |= *utf++; } else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } else if (l == 4) { iutf = *utf++ << 24; iutf |= *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } /* * first, try with combined map if possible */ if (cmap && len > l) { const unsigned char *utf_save = utf; int len_save = len; int l_save = l; len -= l; l = pg_utf_mblen(utf); if (len < l) break; if (!pg_utf8_islegal(utf, l)) break; cutf[0] = iutf; if (l == 1) { if (len_save > 1) { p = bsearch(&cutf[0], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf_save - l_save), len_save); iso = set_iso_code(iso, p->code); } /* ASCII case is easy */ *iso++ = *utf++; continue; } else if (l == 2) { iutf = *utf++ << 8; iutf |= *utf++; } else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } else if (l == 4) { iutf = *utf++ << 24; iutf |= *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } cutf[1] = iutf; 
cp = bsearch(cutf, cmap, size2, sizeof(pg_utf_to_local_combined), compare3); if (cp) code = cp->code; else { /* not found in combined map. try with ordinary map */ p = bsearch(&cutf[0], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf_save - l_save), len_save); iso = set_iso_code(iso, p->code); p = bsearch(&cutf[1], map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf - l), len); code = p->code; } } else /* no cmap or no remaining data */ { p = bsearch(&iutf, map, size1, sizeof(pg_utf_to_local), compare1); if (p == NULL) report_untranslatable_char(PG_UTF8, encoding, (const char *) (utf - l), len); code = p->code; } iso = set_iso_code(iso, code); } if (len > 0) report_invalid_encoding(PG_UTF8, (const char *) utf, len); *iso = '\0'; }
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *		(output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		(optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		(optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		(optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * An embedded NUL, a truncated sequence, or a sequence rejected by
 * pg_utf8_islegal() ends the loop early and is passed to
 * report_invalid_encoding().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
		   unsigned char *iso,
		   const pg_utf_to_local *map, int mapsize,
		   const pg_utf_to_local_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iutf;
	int			l;
	const pg_utf_to_local *p;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/*
		 * Collect coded char of length l, most significant byte first.
		 *
		 * Each byte is widened to uint32 before shifting: left-shifting the
		 * int-promoted lead byte of a 4-byte sequence (which has its high
		 * bit set) by 24 would overflow a 32-bit int, which is undefined
		 * behavior in C.
		 */
		if (l == 2)
		{
			iutf = (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = (uint32) *utf++ << 16;
			iutf |= (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = (uint32) *utf++ << 24;
			iutf |= (uint32) *utf++ << 16;
			iutf |= (uint32) *utf++ << 8;
			iutf |= *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			/* saved so we can rewind if the pair is not in the cmap */
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32		iutf2;
				uint32		cutf[2];

				if (l == 2)
				{
					iutf2 = (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = (uint32) *utf++ << 24;
					iutf2 |= (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				/* the cmap is searched with the pair as its key */
				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		p = bsearch(&iutf, map, mapsize,
					sizeof(pg_utf_to_local), compare1);

		if (p)
		{
			iso = store_coded_char(iso, p->code);
			continue;
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iutf);

			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/*
		 * failed to translate this character; utf has already been advanced
		 * past it, so back up by l to point at its first byte
		 */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}