/*
 * Compare the UTF-8 characters starting at 'c1' and 'c2'.
 *
 * A character encoded in fewer bytes sorts before a character encoded in
 * more bytes. Characters of the same width are ordered by their first
 * differing byte.
 *
 * The bytes are compared as unsigned values. Whether plain 'char' is signed
 * is implementation-defined, so comparing through 'char' would make the
 * ordering of bytes >= 0x80 against bytes < 0x80 (possible with malformed
 * input) depend on the platform. For well-formed UTF-8 the result is the
 * same either way.
 *
 * Returns 1, -1 or 0 if the first character is greater than, less than or
 * equal to the second one.
 */
int
utf8cmp(char *c1, char *c2)
{
	const unsigned char *b1 = (const unsigned char *) c1;
	const unsigned char *b2 = (const unsigned char *) c2;
	unsigned char len1 = pg_utf_mblen((unsigned char *) c1);
	unsigned char len2 = pg_utf_mblen((unsigned char *) c2);
	unsigned char j;

	Assert(len1 <= UTF_MAX_WIDTH && len2 <= UTF_MAX_WIDTH);

	/* Different widths: the wider character is the greater one. */
	if (len1 != len2)
	{
		return len1 > len2 ? 1 : -1;
	}

	/* Same width: the first differing byte decides. */
	for (j = 0; j < len1; j++)
	{
		if (b1[j] != b2[j])
		{
			return b1[j] > b2[j] ? 1 : -1;
		}
	}
	return 0;
}
/*
 * Check whether a number starts at 'str'.
 *
 * The text is parsed with strtod(): the value is stored in '*numValue' and
 * '*end' is set to the first character after the number. If strtod()
 * consumed nothing, false is returned.
 *
 * If 'skipWhitespace' is true, '*end' is additionally advanced over the
 * characters following the number. In that mode the function returns true
 * only if everything up to the terminating NUL is whitespace; at the first
 * non-whitespace character it returns false and leaves '*end' pointing at
 * that character.
 */
bool
xmlStringIsNumber(char *str, double *numValue, char **end, bool skipWhitespace)
{
	*numValue = strtod(str, end);

	/* Nothing was consumed, so there is no number here. */
	if (*end == str)
		return false;

	if (!skipWhitespace)
		return true;

	/* Step over the trailing characters one (multi-byte) character at a time. */
	for (; **end != '\0'; *end += pg_utf_mblen((unsigned char *) *end))
	{
		if (!XNODE_WHITESPACE(*end))
			return false;
	}

	return true;
}
/*
 * Verify the UTF-8 character at 's', of which 'len' bytes are available.
 *
 * Returns the byte length of the character, or -1 if the character does not
 * fit into 'len' bytes or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int			mblen = pg_utf_mblen(s);

	/* The length check comes first so that only available bytes are inspected. */
	if (mblen > len || !pg_utf8_islegal(s, mblen))
		return -1;

	return mblen;
}
Datum utf8_to_iso8859_1(PG_FUNCTION_ARGS) { unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); unsigned short c, c1; Assert(PG_GETARG_INT32(0) == PG_UTF8); Assert(PG_GETARG_INT32(1) == PG_LATIN1); Assert(len >= 0); while (len > 0) { c = *src; if (c == 0) report_invalid_encoding(PG_UTF8, (const char *) src, len); /* fast path for ASCII-subset characters */ if (!IS_HIGHBIT_SET(c)) { *dest++ = c; src++; len--; } else { int l = pg_utf_mblen(src); if (l > len || !pg_utf8_islegal(src, l)) report_invalid_encoding(PG_UTF8, (const char *) src, len); if (l != 2) report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); c1 = src[1] & 0x3f; c = ((c & 0x1f) << 6) | c1; if (c >= 0x80 && c <= 0xff) { *dest++ = (unsigned char) c; src += 2; len -= 2; } else report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); } } *dest = '\0'; PG_RETURN_VOID(); }
/*
 * Calculate the length in characters of a null-terminated UTF-8 string.
 *
 * Returns -1 if the input is not valid UTF-8. That includes a string that
 * ends in the middle of a multi-byte character.
 */
static int
pg_utf8_string_len(const char *source)
{
	const unsigned char *p = (const unsigned char *) source;
	int			l;
	int			i;
	int			num_chars = 0;

	while (*p)
	{
		l = pg_utf_mblen(p);

		/*
		 * pg_utf_mblen() derives the length from the first byte only, and
		 * pg_utf8_islegal() is told nothing but that length, so it is free
		 * to look at all 'l' bytes. Make sure the terminator is not among
		 * them; otherwise bytes beyond the end of the string could be read.
		 */
		for (i = 1; i < l; i++)
		{
			if (p[i] == '\0')
				return -1;
		}

		if (!pg_utf8_islegal(p, l))
			return -1;

		p += l;
		num_chars++;
	}

	return num_chars;
}
/*
 * xmlelement
 *
 * SQL-callable function that constructs an XML element node.
 *
 * Function arguments:
 *	0 - element name. NULL or an empty string raises an error.
 *	1 - attributes, or NULL. Must be a 2-dimensional array whose second
 *		dimension is 2; each row is treated as a (name, value) pair.
 *	2 - node to be nested in the element, or NULL. If it is a document
 *		fragment, the element receives as many children as the fragment
 *		has; any other node becomes the only child.
 *
 * The result buffer is filled in this order: the attribute nodes, the
 * copied child node(s), the element header with the references to the
 * attributes and children, and the element name. The offset of the element
 * header relative to the start of the data is finally stored through
 * XNODE_ROOT_OFFSET_PTR(), and the varlena size accounts for one
 * XMLNodeOffset after the name.
 */
Datum
xmlelement(PG_FUNCTION_ARGS)
{
	Datum nameText;
	ArrayType *attrs = NULL;
	char *elName;
	unsigned int nameLen, resSizeMax;
	unsigned int childSize = 0;
	char *c, *result, *resData, *resCursor, *nameDst;
	XMLCompNodeHdr element;
	XMLNodeOffset *rootOffPtr;
	bool nameFirstChar = true;
	char **attrNames = NULL;
	char **attrValues = NULL;
	char *attrValFlags = NULL;
	XMLNodeHdr *attrNodes = NULL;
	XMLNodeHdr child = NULL;
	char **newNds = NULL;
	char *newNd = NULL;
	unsigned int attrCount = 0;
	unsigned int attrsSizeTotal = 0;
	unsigned short childCount = 0;

	/* The element name is mandatory and must not be empty. */
	if (PG_ARGISNULL(0))
	{
		elog(ERROR, "invalid element name");
	}
	nameText = PG_GETARG_DATUM(0);
	elName = TextDatumGetCString(nameText);
	nameLen = strlen(elName);
	if (nameLen == 0)
	{
		elog(ERROR, "invalid element name");
	}

	/*
	 * Collect the attributes: validate each name and value, and sum up the
	 * space the attribute nodes will need in the result.
	 */
	if (!PG_ARGISNULL(1))
	{
		int *dims;
		Oid elType, arrType;
		int16 arrLen, elLen;
		bool elByVal, elIsNull;
		char elAlign;
		unsigned int i;

		attrs = PG_GETARG_ARRAYTYPE_P(1);
		if (ARR_NDIM(attrs) != 2)
		{
			elog(ERROR, "attributes must be passed in 2 dimensional array");
		}
		dims = ARR_DIMS(attrs);
		if (dims[1] != 2)
		{
			elog(ERROR, "the second dimension of attribute array must be 2");
		}
		attrCount = dims[0];
		Assert(attrCount > 0);
		elType = attrs->elemtype;
		arrType = get_array_type(elType);
		/*
		 * NOTE(review): arrType is passed to get_typlen() before the
		 * assertion below checks that it is valid.
		 */
		arrLen = get_typlen(arrType);
		Assert(arrType != InvalidOid);
		get_typlenbyvalalign(elType, &elLen, &elByVal, &elAlign);
		attrNames = (char **) palloc(attrCount * sizeof(char *));
		attrValues = (char **) palloc(attrCount * sizeof(char *));
		/*
		 * NOTE(review): the cast says (bool *) while the variable is
		 * declared as (char *) and the size is computed from sizeof(char).
		 */
		attrValFlags = (bool *) palloc(attrCount * sizeof(char));

		/*
		 * 'i' is used as the first array subscript and starts at 1; the
		 * second subscript is 1 for the name and 2 for the value. The
		 * local arrays are indexed with (i - 1).
		 */
		for (i = 1; i <= attrCount; i++)
		{
			int subscrName[] = {i, 1};
			int subscrValue[] = {i, 2};
			Datum elDatum;
			char *nameStr, *valueStr;
			bool valueHasRefs = false;

			elDatum = array_ref(attrs, 2, subscrName, arrLen, elLen, elByVal, elAlign, &elIsNull);
			if (elIsNull)
			{
				elog(ERROR, "attribute name must not be null");
			}
			nameStr = text_to_cstring(DatumGetTextP(elDatum));
			if (strlen(nameStr) == 0)
			{
				elog(ERROR, "attribute name must be a string of non-zero length");
			}
			else
			{
				/*
				 * Check validity of characters. This 'c' shadows the
				 * function-level variable of the same name.
				 */
				char *c = nameStr;
				int cWidth = pg_utf_mblen((unsigned char *) c);

				if (!XNODE_VALID_NAME_START(c))
				{
					elog(ERROR, "attribute name starts with invalid character");
				}
				/*
				 * Step over the first character and then over every
				 * following character accepted by XNODE_VALID_NAME_CHAR.
				 * The name is valid only if the loop stopped at the
				 * terminating NUL.
				 */
				do
				{
					c += cWidth;
					cWidth = pg_utf_mblen((unsigned char *) c);
				} while (XNODE_VALID_NAME_CHAR(c));
				if (*c != '\0')
				{
					elog(ERROR, "invalid character in attribute name");
				}
			}

			/*
			 * Check uniqueness of the attribute name against the names
			 * accepted so far.
			 * NOTE(review): 'j' is unsigned short while 'i' is unsigned int.
			 */
			if (i > 1)
			{
				unsigned short j;

				for (j = 0; j < (i - 1); j++)
				{
					if (strcmp(nameStr, attrNames[j]) == 0)
					{
						elog(ERROR, "attribute name '%s' is not unique", nameStr);
					}
				}
			}

			elDatum = array_ref(attrs, 2, subscrValue, arrLen, elLen, elByVal, elAlign, &elIsNull);
			if (elIsNull)
			{
				elog(ERROR, "attribute value must not be null");
			}
			valueStr = text_to_cstring(DatumGetTextP(elDatum));
			attrValFlags[i - 1] = 0;
			if (strlen(valueStr) > 0)
			{
				XMLNodeParserStateData state;
				char *valueStrOrig = valueStr;

				/*
				 * Parse the value and check validity. From here on
				 * 'valueStr' is the string returned by the parser; the
				 * original string is freed below.
				 */
				initXMLParserState(&state, valueStr, true);
				valueStr = readXMLAttValue(&state, true, &valueHasRefs);

				/*
				 * If the value contains quotation mark, then apostrophe is
				 * the delimiter.
				 */
				if (strchr(valueStr, XNODE_CHAR_QUOTMARK) != NULL)
				{
					attrValFlags[i - 1] |= XNODE_ATTR_APOSTROPHE;
				}
				finalizeXMLParserState(&state);
				pfree(valueStrOrig);
			}
			attrNames[i - 1] = nameStr;
			attrValues[i - 1] = valueStr;
			if (valueHasRefs)
			{
				attrValFlags[i - 1] |= XNODE_ATTR_CONTAINS_REF;
			}
			/*
			 * Node header, name and value. The "+ 2" covers the two NUL
			 * terminators written when the attribute is copied below.
			 */
			attrsSizeTotal += sizeof(XMLNodeHdrData) + strlen(nameStr) + strlen(valueStr) + 2;
		}
	}

	/* Find out how much space the nested node(s) will need. */
	if (!PG_ARGISNULL(2))
	{
		Datum childNodeDatum = PG_GETARG_DATUM(2);
		xmlnode childRaw = (xmlnode) PG_DETOAST_DATUM(childNodeDatum);

		child = XNODE_ROOT(childRaw);
		if (child->kind == XMLNODE_DOC_FRAGMENT)
		{
			/*
			 * Presumably the size of the fragment's subtree minus the size
			 * of the fragment node itself - TODO confirm the meaning of
			 * the second argument of getXMLNodeSize().
			 */
			childSize = getXMLNodeSize(child, true) - getXMLNodeSize(child, false);
		}
		else
		{
			childSize = getXMLNodeSize(child, true);
		}
	}

	/*
	 * Make sure the element name is valid: the first character is tested
	 * with XNODE_VALID_NAME_START, all the others with
	 * XNODE_VALID_NAME_CHAR.
	 */
	c = elName;
	while (*c != '\0')
	{
		if ((nameFirstChar && !XNODE_VALID_NAME_START(c)) || (!nameFirstChar && !XNODE_VALID_NAME_CHAR(c)))
		{
			elog(ERROR, "unrecognized character '%c' in element name", *c);
		}
		if (nameFirstChar)
		{
			nameFirstChar = false;
		}
		c += pg_utf_mblen((unsigned char *) c);
	};

	if (child != NULL)
	{
		if (child->kind == XMLNODE_DOC_FRAGMENT)
		{
			childCount = ((XMLCompNodeHdr) child)->children;
		}
		else
		{
			childCount = 1;
		}
	}

	/*
	 * It's hard to determine the byte width of references until the copying
	 * has finished. Therefore we assume the worst case: 4 bytes per
	 * reference.
	 */
	resSizeMax = VARHDRSZ + attrsSizeTotal + childSize + (attrCount + childCount) * 4 + sizeof(XMLCompNodeHdrData) + nameLen + 1 + sizeof(XMLNodeOffset);
	result = (char *) palloc(resSizeMax);
	resCursor = resData = VARDATA(result);

	if (attrCount > 0)
	{
		/*
		 * Copy attributes. Each one is written as a node header followed
		 * by the NUL-terminated name and the NUL-terminated value.
		 * NOTE(review): 'i' is unsigned short while attrCount is unsigned
		 * int.
		 */
		unsigned short i;

		Assert(attrNames != NULL && attrValues != NULL && attrValFlags != NULL);
		attrNodes = (XMLNodeHdr *) palloc(attrCount * sizeof(XMLNodeHdr));
		for (i = 0; i < attrCount; i++)
		{
			XMLNodeHdr attrNode = (XMLNodeHdr) resCursor;
			char *name = attrNames[i];
			/* Shadows the function-level 'nameLen' (element name length). */
			unsigned int nameLen = strlen(name);
			char *value = attrValues[i];
			unsigned int valueLen = strlen(value);

			/* Remember the node so that a reference can be written later. */
			attrNodes[i] = attrNode;
			attrNode->kind = XMLNODE_ATTRIBUTE;
			attrNode->flags = attrValFlags[i];
			if (xmlAttrValueIsNumber(value))
			{
				attrNode->flags |= XNODE_ATTR_NUMBER;
			}
			resCursor = XNODE_CONTENT(attrNode);
			memcpy(resCursor, name, nameLen);
			resCursor += nameLen;
			*(resCursor++) = '\0';
			pfree(name);
			memcpy(resCursor, value, valueLen);
			resCursor += valueLen;
			*(resCursor++) = '\0';
			pfree(value);
		}
		pfree(attrNames);
		pfree(attrValues);
		pfree(attrValFlags);
	}

	if (child != NULL)
	{
		XMLNodeKind k = child->kind;

		/*
		 * Check if the node to be inserted is of a valid kind. If the node is
		 * document fragment, it's assumed that invalid node kinds are never
		 * added. Otherwise we'd have to check the node fragment (recursively)
		 * not only here.
		 */
		if (k != XMLNODE_DOC_FRAGMENT)
		{
			if (k == XMLNODE_DOC || k == XMLNODE_DTD || k == XMLNODE_ATTRIBUTE)
			{
				elog(ERROR, "the nested node must not be %s", getXMLNodeKindStr(k));
			}
		}
		/*
		 * 'newNd' / 'newNds' are used below as the position(s) of the copied
		 * node(s) in the result; the element header is placed at 'resCursor'
		 * right after this call.
		 */
		copyXMLNodeOrDocFragment(child, childSize, &resCursor, &newNd, &newNds);
	}

	/* The element header follows the attributes and children. */
	element = (XMLCompNodeHdr) resCursor;
	element->common.kind = XMLNODE_ELEMENT;
	element->common.flags = (child == NULL) ? XNODE_EMPTY : 0;
	element->children = attrCount + childCount;

	if (childCount > 0 || attrCount > 0)
	{
		XMLNodeOffset childOff, childOffMax;
		char bwidth;
		char *refPtr;

		/*
		 * Save relative offset(s) of the child node(s). The greatest offset
		 * is the distance to the node stored first: the first attribute
		 * (located at 'resData') if there is any, the first copied child
		 * otherwise. It determines the byte width of all references.
		 */
		if (attrCount > 0)
		{
			childOffMax = (char *) element - resData;
		}
		else if (childCount > 0)
		{
			if (child->kind == XMLNODE_DOC_FRAGMENT)
			{
				Assert(newNds != NULL);
				childOffMax = (char *) element - newNds[0];
			}
			else
			{
				childOffMax = (char *) element - newNd;
			}
		}
		else
		{
			childOffMax = 0;
		}
		bwidth = getXMLNodeOffsetByteWidth(childOffMax);
		XNODE_SET_REF_BWIDTH(element, bwidth);
		refPtr = XNODE_FIRST_REF(element);

		if (attrCount > 0)
		{
			unsigned short i;

			/* The attribute references first... */
			for (i = 0; i < attrCount; i++)
			{
				XMLNodeHdr node = attrNodes[i];

				childOff = (char *) element - (char *) node;
				writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
			}
			pfree(attrNodes);
		}
		if (childCount > 0)
		{
			/* ...followed by those of the other children. */
			if (child->kind == XMLNODE_DOC_FRAGMENT)
			{
				unsigned short i;

				for (i = 0; i < childCount; i++)
				{
					childOff = (char *) element - newNds[i];
					writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
				}
				pfree(newNds);
			}
			else
			{
				childOff = (char *) element - newNd;
				writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
			}
		}
	}

	/* And finally set the element name. */
	nameDst = XNODE_ELEMENT_NAME(element);
	memcpy(nameDst, elName, nameLen);
	nameDst[nameLen] = '\0';
	resCursor = nameDst + strlen(elName) + 1;
	/* The total size includes one XMLNodeOffset following the name. */
	SET_VARSIZE(result, (char *) resCursor - result + sizeof(XMLNodeOffset));
	rootOffPtr = XNODE_ROOT_OFFSET_PTR(result);
	*rootOffPtr = (char *) element - resData;
	PG_RETURN_POINTER(result);
}
/* * pg_saslprep - Normalize a password with SASLprep. * * SASLprep requires the input to be in UTF-8 encoding, but PostgreSQL * supports many encodings, so we don't blindly assume that. pg_saslprep * will check if the input looks like valid UTF-8, and returns * SASLPREP_INVALID_UTF8 if not. * * If the string contains prohibited characters (or more precisely, if the * output string would contain prohibited characters after normalization), * returns SASLPREP_PROHIBITED. * * On success, returns SASLPREP_SUCCESS, and the normalized string in * *output. * * In frontend, the normalized string is malloc'd, and the caller is * responsible for freeing it. If an allocation fails, returns * SASLPREP_OOM. In backend, the normalized string is palloc'd instead, * and a failed allocation leads to ereport(ERROR). */ pg_saslprep_rc pg_saslprep(const char *input, char **output) { pg_wchar *input_chars = NULL; pg_wchar *output_chars = NULL; int input_size; char *result; int result_size; int count; int i; bool contains_RandALCat; unsigned char *p; pg_wchar *wp; /* Ensure we return *output as NULL on failure */ *output = NULL; /* Check that the password isn't stupendously long */ if (strlen(input) > MAX_PASSWORD_LENGTH) { #ifndef FRONTEND ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("password too long"))); #else return SASLPREP_OOM; #endif } /* * Quick check if the input is pure ASCII. An ASCII string requires no * further processing. */ if (pg_is_ascii_string(input)) { *output = STRDUP(input); if (!(*output)) goto oom; return SASLPREP_SUCCESS; } /* * Convert the input from UTF-8 to an array of Unicode codepoints. * * This also checks that the input is a legal UTF-8 string. 
*/ input_size = pg_utf8_string_len(input); if (input_size < 0) return SASLPREP_INVALID_UTF8; input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar)); if (!input_chars) goto oom; p = (unsigned char *) input; for (i = 0; i < input_size; i++) { input_chars[i] = utf8_to_unicode(p); p += pg_utf_mblen(p); } input_chars[i] = (pg_wchar) '\0'; /* * The steps below correspond to the steps listed in [RFC3454], Section * "2. Preparation Overview" */ /* * 1) Map -- For each character in the input, check if it has a mapping * and, if so, replace it with its mapping. */ count = 0; for (i = 0; i < input_size; i++) { pg_wchar code = input_chars[i]; if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges)) input_chars[count++] = 0x0020; else if (IS_CODE_IN_TABLE(code, commonly_mapped_to_nothing_ranges)) { /* map to nothing */ } else input_chars[count++] = code; } input_chars[count] = (pg_wchar) '\0'; input_size = count; if (input_size == 0) goto prohibited; /* don't allow empty password */ /* * 2) Normalize -- Normalize the result of step 1 using Unicode * normalization. */ output_chars = unicode_normalize_kc(input_chars); if (!output_chars) goto oom; /* * 3) Prohibit -- Check for any characters that are not allowed in the * output. If any are found, return an error. */ for (i = 0; i < input_size; i++) { pg_wchar code = input_chars[i]; if (IS_CODE_IN_TABLE(code, prohibited_output_ranges)) goto prohibited; if (IS_CODE_IN_TABLE(code, unassigned_codepoint_ranges)) goto prohibited; } /* * 4) Check bidi -- Possibly check for right-to-left characters, and if * any are found, make sure that the whole string satisfies the * requirements for bidirectional strings. If the string does not satisfy * the requirements for bidirectional strings, return an error. * * [RFC3454], Section "6. 
Bidirectional Characters" explains in more * detail what that means: * * "In any profile that specifies bidirectional character handling, all * three of the following requirements MUST be met: * * 1) The characters in section 5.8 MUST be prohibited. * * 2) If a string contains any RandALCat character, the string MUST NOT * contain any LCat character. * * 3) If a string contains any RandALCat character, a RandALCat character * MUST be the first character of the string, and a RandALCat character * MUST be the last character of the string." */ contains_RandALCat = false; for (i = 0; i < input_size; i++) { pg_wchar code = input_chars[i]; if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges)) { contains_RandALCat = true; break; } } if (contains_RandALCat) { pg_wchar first = input_chars[0]; pg_wchar last = input_chars[input_size - 1]; for (i = 0; i < input_size; i++) { pg_wchar code = input_chars[i]; if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges)) goto prohibited; } if (!IS_CODE_IN_TABLE(first, RandALCat_codepoint_ranges) || !IS_CODE_IN_TABLE(last, RandALCat_codepoint_ranges)) goto prohibited; } /* * Finally, convert the result back to UTF-8. */ result_size = 0; for (wp = output_chars; *wp; wp++) { unsigned char buf[4]; unicode_to_utf8(*wp, buf); result_size += pg_utf_mblen(buf); } result = ALLOC(result_size + 1); if (!result) goto oom; /* * There are no error exits below here, so the error exit paths don't need * to worry about possibly freeing "result". */ p = (unsigned char *) result; for (wp = output_chars; *wp; wp++) { unicode_to_utf8(*wp, p); p += pg_utf_mblen(p); } Assert((char *) p == result + result_size); *p = '\0'; FREE(input_chars); FREE(output_chars); *output = result; return SASLPREP_SUCCESS; prohibited: if (input_chars) FREE(input_chars); if (output_chars) FREE(output_chars); return SASLPREP_PROHIBITED; oom: if (input_chars) FREE(input_chars); if (output_chars) FREE(output_chars); return SASLPREP_OOM; }
/*
 * The next token in the input stream is known to be a string; lex it.
 *
 * Scanning starts at lex->token_start and stops at the closing quotation
 * mark; on return lex->token_terminator points just past it.
 *
 * If lex->strval is not NULL, it is reset and the de-escaped contents of
 * the string are accumulated there. If it is NULL, escape sequences are
 * only checked for validity.
 *
 * Malformed input is reported via report_invalid_token() or ereport(ERROR).
 */
static inline void
json_lex_string(JsonLexContext *lex)
{
	char *s;
	int len;
	/*
	 * Contribution of a pending \u high surrogate, already shifted left by
	 * 10 bits, or -1 if no high surrogate is waiting for its low surrogate.
	 */
	int hi_surrogate = -1;

	if (lex->strval != NULL)
		resetStringInfo(lex->strval);

	Assert(lex->input_length > 0);
	s = lex->token_start;
	/* 'len' is kept equal to the offset of 's' within lex->input. */
	len = lex->token_start - lex->input;
	for (;;)
	{
		s++;
		len++;
		/* Premature end of the string. */
		if (len >= lex->input_length)
		{
			lex->token_terminator = s;
			report_invalid_token(lex);
		}
		else if (*s == '"')
			break;
		else if ((unsigned char) *s < 32)
		{
			/* Per RFC4627, these characters MUST be escaped. */
			/* Since *s isn't printable, exclude it from the context string */
			lex->token_terminator = s;
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Character with value 0x%02x must be escaped.",
							   (unsigned char) *s),
					 report_json_context(lex)));
		}
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			s++;
			len++;
			if (len >= lex->input_length)
			{
				lex->token_terminator = s;
				report_invalid_token(lex);
			}
			else if (*s == 'u')
			{
				int i;
				int ch = 0;

				/* Accumulate the value of the four hexadecimal digits. */
				for (i = 1; i <= 4; i++)
				{
					s++;
					len++;
					if (len >= lex->input_length)
					{
						lex->token_terminator = s;
						report_invalid_token(lex);
					}
					else if (*s >= '0' && *s <= '9')
						ch = (ch * 16) + (*s - '0');
					else if (*s >= 'a' && *s <= 'f')
						ch = (ch * 16) + (*s - 'a') + 10;
					else if (*s >= 'A' && *s <= 'F')
						ch = (ch * 16) + (*s - 'A') + 10;
					else
					{
						/* Include the whole offending character in the context. */
						lex->token_terminator = s + pg_mblen(s);
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("\"\\u\" must be followed by four hexadecimal digits."),
								 report_json_context(lex)));
					}
				}
				/*
				 * The escape is converted (and surrogate pairs are checked)
				 * only if the caller wants the de-escaped value.
				 */
				if (lex->strval != NULL)
				{
					char utf8str[5];
					int utf8len;

					if (ch >= 0xd800 && ch <= 0xdbff)
					{
						/* High surrogate: remember it and wait for the low one. */
						if (hi_surrogate != -1)
							ereport(ERROR,
									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
									 errmsg("invalid input syntax for type json"),
									 errdetail("Unicode high surrogate must not follow a high surrogate."),
									 report_json_context(lex)));
						hi_surrogate = (ch & 0x3ff) << 10;
						continue;
					}
					else if (ch >= 0xdc00 && ch <= 0xdfff)
					{
						/* Low surrogate: combine it with the pending high one. */
						if (hi_surrogate == -1)
							ereport(ERROR,
									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
									 errmsg("invalid input syntax for type json"),
									 errdetail("Unicode low surrogate must follow a high surrogate."),
									 report_json_context(lex)));
						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
						hi_surrogate = -1;
					}

					/* Any other code point must not follow a high surrogate. */
					if (hi_surrogate != -1)
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("Unicode low surrogate must follow a high surrogate."),
								 report_json_context(lex)));

					/*
					 * For UTF8, replace the escape sequence by the actual utf8
					 * character in lex->strval. Do this also for other encodings
					 * if the escape designates an ASCII character, otherwise
					 * raise an error. We don't ever unescape a \u0000, since that
					 * would result in an impermissible nul byte.
					 */
					if (ch == 0)
					{
						appendStringInfoString(lex->strval, "\\u0000");
					}
					else if (GetDatabaseEncoding() == PG_UTF8)
					{
						unicode_to_utf8(ch, (unsigned char *) utf8str);
						utf8len = pg_utf_mblen((unsigned char *) utf8str);
						appendBinaryStringInfo(lex->strval, utf8str, utf8len);
					}
					else if (ch <= 0x007f)
					{
						/*
						 * This is the only way to designate things like a form feed
						 * character in JSON, so it's useful in all encodings.
						 */
						appendStringInfoChar(lex->strval, (char) ch);
					}
					else
					{
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
								 report_json_context(lex)));
					}
				}
			}
			else if (lex->strval != NULL)
			{
				/* Single-character escape whose value must be stored. */
				if (hi_surrogate != -1)
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("invalid input syntax for type json"),
							 errdetail("Unicode low surrogate must follow a high surrogate."),
							 report_json_context(lex)));

				switch (*s)
				{
					case '"':
					case '\\':
					case '/':
						appendStringInfoChar(lex->strval, *s);
						break;
					case 'b':
						appendStringInfoChar(lex->strval, '\b');
						break;
					case 'f':
						appendStringInfoChar(lex->strval, '\f');
						break;
					case 'n':
						appendStringInfoChar(lex->strval, '\n');
						break;
					case 'r':
						appendStringInfoChar(lex->strval, '\r');
						break;
					case 't':
						appendStringInfoChar(lex->strval, '\t');
						break;
					default:
						/* Not a valid string escape, so error out. */
						lex->token_terminator = s + pg_mblen(s);
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("Escape sequence \"\\%s\" is invalid.",
										   extract_mb_char(s)),
								 report_json_context(lex)));
				}
			}
			else if (strchr("\"\\/bfnrt", *s) == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping
				 *
				 * It's very tempting to remove the strchr() call here and
				 * replace it with a switch statement, but testing so far has
				 * shown it's not a performance win.
				 */
				lex->token_terminator = s + pg_mblen(s);
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Escape sequence \"\\%s\" is invalid.",
								   extract_mb_char(s)),
						 report_json_context(lex)));
			}
		}
		else if (lex->strval != NULL)
		{
			/* Ordinary character: store it, unless a surrogate pair is incomplete. */
			if (hi_surrogate != -1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Unicode low surrogate must follow a high surrogate."),
						 report_json_context(lex)));

			appendStringInfoChar(lex->strval, *s);
		}
	}

	/* The string must not end while a high surrogate is still pending. */
	if (hi_surrogate != -1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type json"),
				 errdetail("Unicode low surrogate must follow a high surrogate."),
				 report_json_context(lex)));

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = s + 1;
}
/*
 * UTF8 ---> local code
 *
 * utf: input UTF8 string (need not be null-terminated).
 * iso: pointer to the output area (must be large enough!); the output is
 *		null-terminated.
 * map: the conversion map.
 * cmap: the conversion map for combined characters.
 *		  (optional)
 * size1: the size of the conversion map.
 * size2: the size of the conversion map for combined characters
 *		  (optional)
 * encoding: the PG identifier for the local encoding.
 * len: length of input string.
 *
 * Single-byte (ASCII) characters are copied unchanged. Multi-byte
 * characters are packed into a uint32 and looked up in 'map'; if 'cmap' is
 * given and more input follows, a pair of consecutive characters is first
 * looked up in 'cmap'. Invalid input and untranslatable characters are
 * reported via report_invalid_encoding() and report_untranslatable_char().
 */
void
UtfToLocal(const unsigned char *utf, unsigned char *iso, const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap, int size1, int size2, int encoding, int len)
{
	uint32 iutf;
	uint32 cutf[2];
	uint32 code;
	pg_utf_to_local *p;
	pg_utf_to_local_combined *cp;
	int l;

	/* Each iteration consumes 'l' bytes (the character processed last). */
	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);

		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		/*
		 * Pack the bytes of the character into 'iutf', first byte in the
		 * most significant position.
		 * NOTE(review): 'iutf' is not assigned if 'l' is outside 1..4;
		 * presumably pg_utf8_islegal() rejects such lengths - confirm.
		 */
		if (l == 1)
		{
			/* ASCII case is easy */
			*iso++ = *utf++;
			continue;
		}
		else if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}

		/*
		 * first, try with combined map if possible
		 */
		if (cmap && len > l)
		{
			/*
			 * 'utf' already points past the first character here, so
			 * (utf_save - l_save) is the start of the first character.
			 */
			const unsigned char *utf_save = utf;
			int len_save = len;
			int l_save = l;

			/* Account for the first character and look at the second one. */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			cutf[0] = iutf;

			if (l == 1)
			{
				/*
				 * The second character is ASCII, so there is no pair to
				 * look up: convert the first character via the ordinary
				 * map and copy the second one.
				 */
				if (len_save > 1)
				{
					p = bsearch(&cutf[0], map, size1,
								sizeof(pg_utf_to_local), compare1);
					if (p == NULL)
						report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
					iso = set_iso_code(iso, p->code);
				}

				/* ASCII case is easy */
				*iso++ = *utf++;
				continue;
			}
			else if (l == 2)
			{
				iutf = *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 3)
			{
				iutf = *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 4)
			{
				iutf = *utf++ << 24;
				iutf |= *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}

			cutf[1] = iutf;
			cp = bsearch(cutf, cmap, size2,
						 sizeof(pg_utf_to_local_combined), compare3);
			if (cp)
				code = cp->code;
			else
			{
				/*
				 * not found in combined map. try with ordinary map
				 *
				 * Both characters are converted separately here, i.e. the
				 * second character is not retried as the first member of a
				 * following pair.
				 */
				p = bsearch(&cutf[0], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
				iso = set_iso_code(iso, p->code);

				p = bsearch(&cutf[1], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
											   (const char *) (utf - l), len);
				code = p->code;
			}
		}
		else	/* no cmap or no remaining data */
		{
			p = bsearch(&iutf, map, size1,
						sizeof(pg_utf_to_local), compare1);
			if (p == NULL)
				report_untranslatable_char(PG_UTF8, encoding,
										   (const char *) (utf - l), len);
			code = p->code;
		}
		iso = set_iso_code(iso, code);
	}

	/* Input is left over only if the loop was terminated by "break". */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * Single-byte (ASCII) characters bypass all of the above and are copied to
 * the output unchanged.
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_utf_to_local *map, int mapsize, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, int encoding)
{
	uint32 iutf;
	int l;
	const pg_utf_to_local *p;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	/* Each iteration consumes 'l' bytes (the character processed last). */
	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/*
		 * collect coded char of length l: the bytes are packed into 'iutf'
		 * with the first byte in the most significant position
		 */
		if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			/*
			 * Remember the state right after the first character, so that
			 * it can be restored if the pair is not found.
			 */
			const unsigned char *utf_save = utf;
			int len_save = len;
			int l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32 iutf2;
				uint32 cutf[2];

				if (l == 2)
				{
					iutf2 = *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = *utf++ << 24;
					iutf2 |= *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					/*
					 * Both characters are consumed: the first one was
					 * subtracted from 'len' above, the loop step subtracts
					 * the second one.
					 */
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		p = bsearch(&iutf, map, mapsize,
					sizeof(pg_utf_to_local), compare1);

		if (p)
		{
			iso = store_coded_char(iso, p->code);
			continue;
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32 converted = (*conv_func) (iutf);

			/* A zero result is treated as "no conversion available". */
			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/*
		 * failed to translate this character; 'utf' points past it, so
		 * (utf - l) is where it starts
		 */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}