/*
 * Lex the string token that begins at lex->token_start.
 *
 * The caller has already established that the next token is a string.
 * Scanning starts at the byte after lex->token_start and proceeds until an
 * unescaped double quote is found.  Running past lex->input_length is handed
 * to report_invalid_token(); unescaped bytes below 0x20, malformed \u
 * sequences and unknown escapes are reported through ereport(ERROR).
 *
 * If lex->strval is not NULL it is reset and then filled with the de-escaped
 * contents of the string; in that mode \u escapes are decoded and surrogate
 * pairs are checked and combined.  If lex->strval is NULL, escapes are only
 * checked for syntax.
 *
 * On success lex->prev_token_terminator receives the previous value of
 * lex->token_terminator, and lex->token_terminator is set to the byte just
 * past the closing quote.
 */
static inline void
json_lex_string(JsonLexContext *lex)
{
	char	   *cur;
	int			pos;
	int			pending_hi = -1;	/* shifted high surrogate, or -1 if none */

	if (lex->strval != NULL)
		resetStringInfo(lex->strval);

	Assert(lex->input_length > 0);
	cur = lex->token_start;
	pos = lex->token_start - lex->input;

	for (;;)
	{
		int			codepoint = 0;
		int			ndigits;
		char		encoded[5];
		int			encoded_len;

		cur++;
		pos++;

		/* Premature end of the string. */
		if (pos >= lex->input_length)
		{
			lex->token_terminator = cur;
			report_invalid_token(lex);
			continue;
		}

		/* An unescaped quote closes the string. */
		if (*cur == '"')
			break;

		if ((unsigned char) *cur < 32)
		{
			/* Per RFC4627, these characters MUST be escaped. */
			/* Since *cur isn't printable, exclude it from the context string */
			lex->token_terminator = cur;
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Character with value 0x%02x must be escaped.",
							   (unsigned char) *cur),
					 report_json_context(lex)));
			continue;
		}

		if (*cur != '\\')
		{
			/*
			 * Ordinary character.  It is only copied when de-escaping, and it
			 * may not appear while a high surrogate is still waiting for its
			 * partner.
			 */
			if (lex->strval != NULL)
			{
				if (pending_hi != -1)
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("invalid input syntax for type json"),
							 errdetail("Unicode low surrogate must follow a high surrogate."),
							 report_json_context(lex)));

				appendStringInfoChar(lex->strval, *cur);
			}
			continue;
		}

		/* OK, we have an escape character; step onto the byte after it. */
		cur++;
		pos++;
		if (pos >= lex->input_length)
		{
			lex->token_terminator = cur;
			report_invalid_token(lex);
			continue;
		}

		if (*cur != 'u')
		{
			if (lex->strval == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping.
				 *
				 * The strchr() call is kept on purpose: per the original
				 * author's testing, a switch statement here was not a
				 * performance win.
				 *
				 * NOTE(review): strchr() also matches the terminating NUL of
				 * its first argument, so a NUL byte after the backslash
				 * passes this check; presumably the input never contains NUL
				 * bytes - confirm.
				 */
				if (strchr("\"\\/bfnrt", *cur) == NULL)
				{
					lex->token_terminator = cur + pg_mblen(cur);
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("invalid input syntax for type json"),
							 errdetail("Escape sequence \"\\%s\" is invalid.",
									   extract_mb_char(cur)),
							 report_json_context(lex)));
				}
				continue;
			}

			/* De-escaping: a single-character escape cannot finish a pair. */
			if (pending_hi != -1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Unicode low surrogate must follow a high surrogate."),
						 report_json_context(lex)));

			switch (*cur)
			{
				case '"':
				case '\\':
				case '/':
					/* These escapes stand for themselves. */
					appendStringInfoChar(lex->strval, *cur);
					break;
				case 'b':
					appendStringInfoChar(lex->strval, '\b');
					break;
				case 'f':
					appendStringInfoChar(lex->strval, '\f');
					break;
				case 'n':
					appendStringInfoChar(lex->strval, '\n');
					break;
				case 'r':
					appendStringInfoChar(lex->strval, '\r');
					break;
				case 't':
					appendStringInfoChar(lex->strval, '\t');
					break;
				default:
					/* Not a valid string escape, so error out. */
					lex->token_terminator = cur + pg_mblen(cur);
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("invalid input syntax for type json"),
							 errdetail("Escape sequence \"\\%s\" is invalid.",
									   extract_mb_char(cur)),
							 report_json_context(lex)));
			}
			continue;
		}

		/* A \u escape: consume exactly four hexadecimal digits. */
		for (ndigits = 0; ndigits < 4; ndigits++)
		{
			cur++;
			pos++;
			if (pos >= lex->input_length)
			{
				lex->token_terminator = cur;
				report_invalid_token(lex);
			}
			else if (*cur >= '0' && *cur <= '9')
				codepoint = (codepoint * 16) + (*cur - '0');
			else if (*cur >= 'a' && *cur <= 'f')
				codepoint = (codepoint * 16) + (*cur - 'a') + 10;
			else if (*cur >= 'A' && *cur <= 'F')
				codepoint = (codepoint * 16) + (*cur - 'A') + 10;
			else
			{
				lex->token_terminator = cur + pg_mblen(cur);
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("\"\\u\" must be followed by four hexadecimal digits."),
						 report_json_context(lex)));
			}
		}

		/* Nothing more to do for \u unless we are de-escaping. */
		if (lex->strval == NULL)
			continue;

		if (codepoint >= 0xd800 && codepoint <= 0xdbff)
		{
			/* High surrogate: remember it and wait for the low half. */
			if (pending_hi != -1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Unicode high surrogate must not follow a high surrogate."),
						 report_json_context(lex)));
			pending_hi = (codepoint & 0x3ff) << 10;
			continue;
		}
		else if (codepoint >= 0xdc00 && codepoint <= 0xdfff)
		{
			/* Low surrogate: combine it with the remembered high half. */
			if (pending_hi == -1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Unicode low surrogate must follow a high surrogate."),
						 report_json_context(lex)));
			codepoint = 0x10000 + pending_hi + (codepoint & 0x3ff);
			pending_hi = -1;
		}

		/* Any other code point must not follow a dangling high surrogate. */
		if (pending_hi != -1)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Unicode low surrogate must follow a high surrogate."),
					 report_json_context(lex)));

		/*
		 * For UTF8, replace the escape sequence by the actual utf8 character
		 * in lex->strval.  Do this also for other encodings if the escape
		 * designates an ASCII character, otherwise raise an error.  We don't
		 * ever unescape a \u0000, since that would result in an impermissible
		 * nul byte.
		 */
		if (codepoint == 0)
		{
			appendStringInfoString(lex->strval, "\\u0000");
		}
		else if (GetDatabaseEncoding() == PG_UTF8)
		{
			unicode_to_utf8(codepoint, (unsigned char *) encoded);
			encoded_len = pg_utf_mblen((unsigned char *) encoded);
			appendBinaryStringInfo(lex->strval, encoded, encoded_len);
		}
		else if (codepoint <= 0x007f)
		{
			/*
			 * This is the only way to designate things like a form feed
			 * character in JSON, so it's useful in all encodings.
			 */
			appendStringInfoChar(lex->strval, (char) codepoint);
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
					 report_json_context(lex)));
		}
	}

	/* The string must not end while a high surrogate is still pending. */
	if (pending_hi != -1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type json"),
				 errdetail("Unicode low surrogate must follow a high surrogate."),
				 report_json_context(lex)));

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = cur + 1;
}
/*
 * The next token in the input stream is known to be a string; lex it.
 *
 * Scanning starts at the byte after lex->token_start and stops at the first
 * unescaped double quote.  The input is treated as NUL-terminated: a NUL
 * byte before the closing quote is handed to report_invalid_token().
 * Unescaped control characters, malformed \u sequences and unknown escapes
 * are reported with ereport(ERROR), with lex->line_number in the detail.
 *
 * On success lex->token_terminator is set to the byte just past the closing
 * quote.  The string contents are only validated here, not de-escaped.
 */
static void
json_lex_string(JsonLexContext *lex)
{
	char	   *s;

	for (s = lex->token_start + 1; *s != '"'; ++s)
	{
		/*
		 * Per RFC4627, these characters MUST be escaped.
		 *
		 * The comparison is done on unsigned char: plain char may be signed,
		 * in which case bytes >= 0x80 (such as those of non-ASCII multibyte
		 * characters) would be negative and be wrongly rejected here.
		 */
		if ((unsigned char) *s < 32)
		{
			/* A NUL byte marks the (premature) end of the string. */
			if (*s == '\0')
			{
				lex->token_terminator = s;
				report_invalid_token(lex);
			}
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail_internal("line %d: Character \"%c\" must be escaped.",
										lex->line_number, *s)));
		}
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			++s;
			if (*s == '\0')
			{
				lex->token_terminator = s;
				report_invalid_token(lex);
			}
			else if (*s == 'u')
			{
				int			i;

				/* The four bytes after the 'u' must all be hex digits. */
				for (i = 1; i <= 4; ++i)
				{
					if (s[i] == '\0')
					{
						lex->token_terminator = s + i;
						report_invalid_token(lex);
					}
					else if (!((s[i] >= '0' && s[i] <= '9') ||
							   (s[i] >= 'a' && s[i] <= 'f') ||
							   (s[i] >= 'A' && s[i] <= 'F')))
					{
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail_internal("line %d: \"\\u\" must be followed by four hexadecimal digits.",
													lex->line_number)));
					}
				}

				/* Account for the four additional bytes we just parsed. */
				s += 4;
			}
			else if (!strchr("\"\\/bfnrt", *s))
			{
				/* Error out. */
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail_internal("line %d: Invalid escape \"\\%s\".",
											lex->line_number, extract_mb_char(s))));
			}
		}
	}

	/* Hooray, we found the end of the string! */
	lex->token_terminator = s + 1;
}
/*
 * Lex a string token; the caller already knows the next token is a string.
 *
 * The scan begins one byte past lex->token_start and runs up to the first
 * unescaped double quote.  The input is treated as NUL-terminated: hitting a
 * NUL byte first is handed to report_invalid_token().  Unescaped bytes below
 * 0x20, malformed \u sequences and unknown escapes raise ereport(ERROR) after
 * lex->token_terminator has been positioned for the context report.
 *
 * On success lex->token_terminator points just past the closing quote.  The
 * string is only validated; nothing is de-escaped.
 */
static void
json_lex_string(JsonLexContext *lex)
{
	char	   *p;

	for (p = lex->token_start + 1; *p != '"'; p++)
	{
		int			hexval = 0;
		int			k;

		/* Per RFC4627, these characters MUST be escaped. */
		if ((unsigned char) *p < 32)
		{
			/* A NUL byte marks the (premature) end of the string. */
			if (*p == '\0')
			{
				lex->token_terminator = p;
				report_invalid_token(lex);
			}

			/* Since *p isn't printable, exclude it from the context string */
			lex->token_terminator = p;
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Character with value 0x%02x must be escaped.",
							   (unsigned char) *p),
					 report_json_context(lex)));
			continue;
		}

		/* Anything other than a backslash needs no further checking. */
		if (*p != '\\')
			continue;

		/* OK, we have an escape character; examine the byte after it. */
		p++;
		if (*p == '\0')
		{
			lex->token_terminator = p;
			report_invalid_token(lex);
			continue;
		}

		if (*p != 'u')
		{
			if (strchr("\"\\/bfnrt", *p) == NULL)
			{
				/* Not a valid string escape, so error out. */
				lex->token_terminator = p + pg_mblen(p);
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Escape sequence \"\\%s\" is invalid.",
								   extract_mb_char(p)),
						 report_json_context(lex)));
			}
			continue;
		}

		/*
		 * \u must be followed by four hexadecimal digits.  Their value is
		 * accumulated in hexval but not otherwise used in this function.
		 */
		for (k = 0; k < 4; k++)
		{
			int			digit;

			p++;
			if (*p == '\0')
			{
				lex->token_terminator = p;
				report_invalid_token(lex);
				continue;
			}

			if (*p >= '0' && *p <= '9')
				digit = *p - '0';
			else if (*p >= 'a' && *p <= 'f')
				digit = (*p - 'a') + 10;
			else if (*p >= 'A' && *p <= 'F')
				digit = (*p - 'A') + 10;
			else
				digit = -1;

			if (digit < 0)
			{
				lex->token_terminator = p + pg_mblen(p);
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("\"\\u\" must be followed by four hexadecimal digits."),
						 report_json_context(lex)));
			}
			else
				hexval = (hexval * 16) + digit;
		}
	}

	/* Hooray, we found the end of the string! */
	lex->token_terminator = p + 1;
}