/*
 * utf8_print:
 * @input: UTF-8 encoded bytes to dump
 * @length: number of bytes available at @input
 * @stream: stdio stream to print to
 *
 * INTERNAL - Print the characters of a UTF-8 string as "U+XXXX" code
 * points (at least four upper-case hex digits each), separated by
 * single spaces.
 *
 * Printing stops quietly at the first NUL byte, when @length bytes have
 * been consumed, or as soon as the decoder returns a size <= 0.
 */
static void
utf8_print(const unsigned char *input, int length, FILE *stream)
{
  int offset;

  for(offset = 0; offset < length && *input; ) {
    unsigned long codepoint;
    int width;

    width = raptor_unicode_utf8_string_get_char(input, length - offset,
                                                &codepoint);
    if(width <= 0)
      return;

    /* separator goes before every character except the first */
    if(offset)
      fputc(' ', stream);

    fprintf(stream, "U+%04X", (int)codepoint);

    input += width;
    offset += width;
  }
}
/* * raptor_ntriples_parse_term_internal: * @world: raptor world * @locator: locator object (in/out) (or NULL) * @start: pointer to starting character of string (in) * @dest: destination of string (in) * @lenp: pointer to length of string (in/out) * @dest_lenp: pointer to length of destination string (out) * @end_char: string ending character * @class: string class * * INTERNAL - Parse an N-Triples term with escapes. * * Relies that @dest is long enough; it need only be as large as the * input string @start since when UTF-8 encoding, the escapes are * removed and the result is always less than or equal to length of * input. * * N-Triples strings / URIs are written in ASCII at present; * characters outside the printable ASCII range are discarded with a * warning. See the grammar for full details of the allowed ranges. * * UTF-8 and the \u and \U esapes are both allowed. * * Return value: Non 0 on failure **/ static int raptor_ntriples_parse_term_internal(raptor_world* world, raptor_locator* locator, const unsigned char **start, unsigned char *dest, size_t *lenp, size_t *dest_lenp, char end_char, raptor_ntriples_term_class term_class) { const unsigned char *p = *start; unsigned char c = '\0'; size_t ulen = 0; unsigned long unichar = 0; unsigned int position = 0; int end_char_seen = 0; /* find end of string, fixing backslashed characters on the way */ while(*lenp > 0) { int unichar_width; c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } if(c > 0x7f) { /* just copy the UTF-8 bytes through */ int unichar_len; unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "UTF-8 encoding error at character %d (0x%02X) found.", c, c); /* UTF-8 encoding had an error or ended in the middle of a string */ return 1; } memmove(dest, p-1, unichar_len); dest += unichar_len; unichar_len--; /* p, *lenp were 
moved on by 1 earlier */ p += unichar_len; (*lenp) -= unichar_len; if(locator) { locator->column += unichar_len; locator->byte += unichar_len; } continue; } if(c != '\\') { /* finish at non-backslashed end_char */ if(end_char && c == end_char) { end_char_seen = 1; break; } if(!raptor_ntriples_term_valid(c, position, term_class)) { if(end_char) { /* end char was expected, so finding an invalid thing is an error */ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c); return 0; } else { /* it's the end - so rewind 1 to save next char */ p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') { /* If bnode id ended on '.' move back one */ dest--; p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } } break; } } /* otherwise store and move on */ *dest++ = c; position++; continue; } if(!*lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input."); return 0; } c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } switch(c) { case '"': case '\\': *dest++ = c; break; case 'b': *dest++ = '\b'; break; case 'f': *dest++ = '\f'; break; case 'n': *dest++ = '\n'; break; case 'r': *dest++ = '\r'; break; case 't': *dest++ = '\t'; break; case '<': case '>': case '{': case '}': case '|': case '^': case '`': /* Turtle 2013 allows these in URIs (as well as \" and \\) */ *dest++ = c; break; case 'u': case 'U': ulen = (c == 'u') ? 
4 : 8; if(*lenp < ulen) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c); return 0; } if(1) { unsigned int ii; int n = 0; for(ii = 0; ii < ulen; ii++) { char cc = p[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, p); n = 1; break; } } if(n) break; n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p); break; } } p += ulen; (*lenp) -= ulen; if(locator) { locator->column += RAPTOR_GOOD_CAST(int, ulen); locator->byte += RAPTOR_GOOD_CAST(int, ulen); } if(unichar > raptor_unicode_max_codepoint) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); break; } unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4); if(unichar_width < 0) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar); break; } /* The destination length is set here to 4 since we know that in * all cases, the UTF-8 encoded output sequence is always shorter * than the input sequence, and the buffer is edited in place. * \uXXXX: 6 bytes input - UTF-8 max 3 bytes output * \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output */ dest += (int)unichar_width; break; default: raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start); return 0; } position++; } /* end while */
/** * raptor_string_escaped_write: * @string: UTF-8 string to write * @len: length of UTF-8 string * @delim: Terminating delimiter character for string (such as " or >) or \0 for no escaping. * @flags: bit flags - see #raptor_escaped_write_bitflags * @iostr: #raptor_iostream to write to * * Write a UTF-8 string formatted using different escapes to a #raptor_iostream * * Supports writing escapes in the Python, N-Triples, Turtle, JSON, mKR, * SPARQL styles to an iostream. * * Return value: non-0 on failure such as bad UTF-8 encoding. **/ int raptor_string_escaped_write(const unsigned char *string, size_t len, const char delim, unsigned int flags, raptor_iostream *iostr) { unsigned char c; int unichar_len; raptor_unichar unichar; if(!string) return 1; for(; (c=*string); string++, len--) { if((delim && c == delim && (delim == '\'' || delim == '"')) || c == '\\') { raptor_iostream_write_byte('\\', iostr); raptor_iostream_write_byte(c, iostr); continue; } if(delim && c == delim) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_SPARQL_URI_ESCAPES) { /* Must escape #x00-#x20<>\"{}|^` */ if(c <= 0x20 || c == '<' || c == '>' || c == '\\' || c == '"' || c == '{' || c == '}' || c == '|' || c == '^' || c == '`') { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } else if(c < 0x7f) { raptor_iostream_write_byte(c, iostr); continue; } } if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_BS_ESCAPES_TNRU) { if(c == 0x09) { raptor_iostream_counted_string_write("\\t", 2, iostr); continue; } else if(c == 0x0a) { raptor_iostream_counted_string_write("\\n", 2, iostr); continue; } else if(c == 0x0d) { raptor_iostream_counted_string_write("\\r", 2, iostr); continue; } else if(c < 0x20 || c == 0x7f) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } } if(flags & 
RAPTOR_ESCAPED_WRITE_BITFLAG_BS_ESCAPES_BF) { if(c == 0x08) { /* JSON has \b for backspace */ raptor_iostream_counted_string_write("\\b", 2, iostr); continue; } else if(c == 0x0b) { /* JSON has \f for formfeed */ raptor_iostream_counted_string_write("\\f", 2, iostr); continue; } } /* Just format remaining characters */ if(c < 0x7f) { raptor_iostream_write_byte(c, iostr); continue; } /* It is unicode */ unichar_len = raptor_unicode_utf8_string_get_char(string, len, &unichar); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > len) /* UTF-8 encoding had an error or ended in the middle of a string */ return 1; if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_UTF8) { /* UTF-8 is allowed so no need to escape */ raptor_iostream_counted_string_write(string, unichar_len, iostr); } else { if(unichar < 0x10000) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(RAPTOR_GOOD_CAST(unsigned int, unichar), 4, iostr); } else { raptor_iostream_counted_string_write("\\U", 2, iostr); raptor_iostream_hexadecimal_write(RAPTOR_GOOD_CAST(unsigned int, unichar), 8, iostr); } }
/**
 * rasqal_escaped_name_to_utf8_string:
 * @src: source name string
 * @len: length of source name string in bytes
 * @dest_lenp: pointer to store the length of the result string (or NULL)
 * @error_handler: error handling function (or NULL)
 * @error_data: data for error handler
 *
 * Get a UTF-8 and/or \u-escaped name as UTF-8.
 *
 * Bytes above 0x7f are copied through as whole UTF-8 sequences.  The
 * escapes \" and \\ are replaced by the escaped character, and \uXXXX /
 * \UXXXXXXXX (exactly 4 / 8 hexadecimal digits) by the UTF-8 encoding
 * of that code point.
 *
 * If dest_lenp is not NULL, the length of the resulting string is
 * stored at the pointed size_t.
 *
 * These problems are reported through @error_handler and decoding then
 * carries on with the bytes that follow: a \u or \U escape containing a
 * non-hexadecimal digit, a code point above 0x10ffff, or a code point
 * the UTF-8 encoder rejects.
 *
 * Return value: new NUL-terminated UTF-8 string allocated with
 * RASQAL_MALLOC, or NULL on failure (allocation failure, bad or
 * truncated UTF-8 sequence, input ending inside an escape, or an
 * unknown escape character).
 */
unsigned char*
rasqal_escaped_name_to_utf8_string(const unsigned char *src, size_t len,
                                   size_t *dest_lenp,
                                   int (*error_handler)(rasqal_query *error_data, const char *message, ...),
                                   rasqal_query* error_data)
{
  const unsigned char *p = src;
  unsigned char *result;
  unsigned char *dest;
  unsigned char *endp;

  result = (unsigned char*)RASQAL_MALLOC(cstring, len+1);
  if(!result)
    return NULL;

  dest = result;
  endp = result + len;

  /* find end of string, fixing backslashed characters on the way */
  while(len > 0) {
    unsigned char c = *p;

    if(c > 0x7f) {
      /* just copy the UTF-8 bytes through */

      /* p has not been advanced yet, so exactly len bytes remain.  The
       * result is kept as an int so that a negative error return is
       * seen as an error rather than as a huge unsigned length.
       */
      int unichar_len = raptor_unicode_utf8_string_get_char(p, len, NULL);

      if(unichar_len < 1 || (size_t)unichar_len > len) {
        if(error_handler)
          error_handler(error_data,
                        "UTF-8 encoding error at character %d (0x%02X) found.",
                        c, c);
        /* UTF-8 encoding had an error or ended in the middle of a string */
        RASQAL_FREE(cstring, result);
        return NULL;
      }

      memcpy(dest, p, (size_t)unichar_len);
      dest += unichar_len;
      p += unichar_len;
      len -= (size_t)unichar_len;
      continue;
    }

    p++;
    len--;

    if(c != '\\') {
      /* not an escape - store and move on */
      *dest++ = c;
      continue;
    }

    if(!len) {
      /* a lone backslash is the last byte of the input */
      if(error_handler)
        error_handler(error_data, "\\ at end of input.");
      RASQAL_FREE(cstring, result);
      return NULL;
    }

    c = *p++;
    len--;

    switch(c) {
      case '"':
      case '\\':
        *dest++ = c;
        break;

      case 'u':
      case 'U': {
        size_t ulen = (c == 'u') ? 4 : 8;
        unsigned long unichar = 0;
        size_t i;
        int width;

        if(len < ulen) {
          if(error_handler)
            error_handler(error_data, "%c over end of line", c);
          RASQAL_FREE(cstring, result);
          return NULL;
        }

        /* Require exactly ulen hex digits.  A scanf-style conversion
         * would also accept leading whitespace, a sign, a "0x" prefix
         * or fewer digits, while ulen bytes are skipped regardless.
         */
        for(i = 0; i < ulen; i++) {
          unsigned char h = p[i];
          unsigned int digit;

          if(h >= '0' && h <= '9')
            digit = (unsigned int)(h - '0');
          else if(h >= 'a' && h <= 'f')
            digit = (unsigned int)(h - 'a') + 10;
          else if(h >= 'A' && h <= 'F')
            digit = (unsigned int)(h - 'A') + 10;
          else
            break;

          unichar = (unichar << 4) | digit;
        }

        if(i < ulen) {
          /* not consumed: the following bytes are handled as ordinary
           * input by the next loop iterations
           */
          if(error_handler)
            error_handler(error_data, "Bad %c escape", c);
          break;
        }

        p += ulen;
        len -= ulen;

        if(unichar > 0x10ffff) {
          if(error_handler)
            error_handler(error_data,
                          "Illegal Unicode character with code point #x%lX.",
                          unichar);
          break;
        }

        width = raptor_unicode_utf8_string_put_char(unichar, dest,
                                                    (size_t)(endp - dest));
        if(width < 0) {
          /* never move dest backwards on an encoder failure */
          if(error_handler)
            error_handler(error_data,
                          "Illegal Unicode character with code point #x%lX.",
                          unichar);
          break;
        }

        dest += width;
        break;
      }

      default:
        /* NOTE(review): "%s" assumes @src is NUL-terminated - confirm
         * against callers
         */
        if(error_handler)
          error_handler(error_data,
                        "Illegal string escape \\%c in \"%s\"",
                        c, (const char*)src);
        RASQAL_FREE(cstring, result);
        return NULL;
    }

  } /* end while */

  /* terminate dest, can be shorter than source */
  *dest = '\0';

  if(dest_lenp)
    *dest_lenp = (size_t)(dest - result);

  return result;
}
/**
 * raptor_string_python_write:
 * @string: UTF-8 string to write
 * @len: length of UTF-8 string
 * @delim: Terminating delimiter character for string (such as " or >)
 * or \0 for no escaping.
 * @flags: flags 0=N-Triples mode, 1=Turtle (allow raw UTF-8), 2=Turtle long string (allow raw UTF-8), 3=JSON
 * @iostr: #raptor_iostream to write to
 *
 * Write a UTF-8 string using Python-style escapes (N-Triples, Turtle, JSON) to an iostream.
 *
 * Writing stops at the first NUL byte or after @len bytes, whichever
 * comes first.
 *
 * A backslash, and @delim when it is ' or ", are written with a
 * preceding backslash; any other @delim is written as a \u escape.
 * Except in Turtle long string mode, tab, newline and carriage return
 * are written as \t, \n and \r and the remaining control characters
 * and 0x7f as \u escapes; JSON mode additionally writes backspace
 * (0x08) as \b and form feed (0x0c) as \f.  Characters above 0x7f are
 * written as raw UTF-8 in the Turtle and JSON modes and as \uXXXX or
 * \UXXXXXXXX escapes in N-Triples mode.
 *
 * Return value: non-0 on failure such as a NULL @string, @flags out of
 * range or bad UTF-8 encoding.
 **/
int
raptor_string_python_write(const unsigned char *string,
                           size_t len,
                           const char delim,
                           int flags,
                           raptor_iostream *iostr)
{
  unsigned char c;
  int unichar_len;
  raptor_unichar unichar;

  if(!string)
    return 1;

  if(flags < 0 || flags > 3)
    return 1;

  /* Bounded by len as well as by NUL so that a string that is not
   * NUL-terminated within len bytes is never read past its end and
   * len cannot wrap around.
   */
  for(; len > 0 && (c = *string); string++, len--) {
    if((delim && c == delim && (delim == '\'' || delim == '"')) ||
       c == '\\') {
      raptor_iostream_write_byte('\\', iostr);
      raptor_iostream_write_byte(c, iostr);
      continue;
    }

    if(delim && c == delim) {
      raptor_iostream_counted_string_write("\\u", 2, iostr);
      raptor_iostream_hexadecimal_write(c, 4, iostr);
      continue;
    }

    if(flags != 2) {
      /* N-Triples, Turtle or JSON */

      /* Note: NTriples is ASCII */
      if(c == 0x09) {
        raptor_iostream_counted_string_write("\\t", 2, iostr);
        continue;
      } else if((flags == 3) && c == 0x08) {
        /* JSON has \b for backspace */
        raptor_iostream_counted_string_write("\\b", 2, iostr);
        continue;
      } else if(c == 0x0a) {
        raptor_iostream_counted_string_write("\\n", 2, iostr);
        continue;
      } else if((flags == 3) && c == 0x0c) {
        /* JSON has \f for formfeed, which is 0x0c; vertical tab (0x0b)
         * has no short escape and is written as \u000B below
         */
        raptor_iostream_counted_string_write("\\f", 2, iostr);
        continue;
      } else if(c == 0x0d) {
        raptor_iostream_counted_string_write("\\r", 2, iostr);
        continue;
      } else if(c < 0x20 || c == 0x7f) {
        raptor_iostream_counted_string_write("\\u", 2, iostr);
        raptor_iostream_hexadecimal_write(c, 4, iostr);
        continue;
      } else if(c < 0x80) {
        raptor_iostream_write_byte(c, iostr);
        continue;
      }
    } else if(c < 0x80) {
      /* Turtle long string has no escapes except delim */
      raptor_iostream_write_byte(c, iostr);
      continue;
    }

    /* It is unicode */

    unichar_len = raptor_unicode_utf8_string_get_char(string, len, NULL);
    if(unichar_len < 1 || (size_t)unichar_len > len)
      /* UTF-8 encoding had an error or ended in the middle of a string */
      return 1;

    if(flags != 0) {
      /* Turtle and JSON are UTF-8 - no need to escape */
      raptor_iostream_counted_string_write(string, unichar_len, iostr);
    } else {
      /* N-Triples: decode the code point.  The decode can still fail
       * here, so check it before unichar or unichar_len are used.
       */
      unichar_len = raptor_unicode_utf8_string_get_char(string, len, &unichar);
      if(unichar_len < 1 || (size_t)unichar_len > len)
        return 1;

      if(unichar < 0x10000) {
        raptor_iostream_counted_string_write("\\u", 2, iostr);
        raptor_iostream_hexadecimal_write((unsigned int)unichar, 4, iostr);
      } else {
        raptor_iostream_counted_string_write("\\U", 2, iostr);
        raptor_iostream_hexadecimal_write((unsigned int)unichar, 8, iostr);
      }
    }

    unichar_len--; /* since loop does len-- */
    string += unichar_len;
    len -= (size_t)unichar_len;
  }

  return 0;
}
/** * raptor_xml_escape_string_any: * @world: raptor world * @string: string to XML escape (UTF-8) * @len: length of string * @buffer: the buffer to use for new string (UTF-8) or NULL to just calculate expected length * @length: buffer size * @quote: optional quote character to escape for attribute content, or 0 * @xml_version: XML 1.0 (10) or XML 1.1 (11) * * Return an XML-escaped version a string. * * Follows * <ulink url="http://www.w3.org/TR/xml-c14n#ProcessingModel">Canonical XML rules on Text Nodes and Attribute Nodes</ulink> * * Both: * Replaces <literal>&</literal> and <literal><</literal> * with <literal>&amp;</literal> and <literal>&lt;</literal> * respectively, preserving other characters. * * Text Nodes: * <literal>></literal> is turned into <literal>&gt;</literal> * ##xD is turned into <literal>&##xD;</literal> * * Attribute Nodes: * <literal>></literal> is generated not <literal>&gt</literal>. * ##x9, ##xA and ##xD are turned into * <literal>&##x9;</literal>, * <literal>&##xA;</literal> and * <literal>&##xD;</literal> * entities. * * If @quote is given it can be either of '\'' or '\"' * which will be turned into <literal>&apos;</literal> or * <literal>&quot;</literal> respectively. * ASCII NUL ('\0') or any other character will not be escaped. * * If @buffer is NULL, no work is done but the size of buffer * required is returned. The output in buffer remains in UTF-8. * * If the input @string is empty, a single NUL will be written to the * buffer. * * Return value: the number of bytes required / used or <0 on failure. 
**/ int raptor_xml_escape_string_any(raptor_world *world, const unsigned char *string, size_t len, unsigned char *buffer, size_t length, char quote, int xml_version) { size_t l; size_t new_len = 0; const unsigned char *p; unsigned char *q; int unichar_len; raptor_unichar unichar; if(!string) return -1; RAPTOR_ASSERT_OBJECT_POINTER_RETURN_VALUE(world, raptor_world, -1); raptor_world_open(world); if(quote != '\"' && quote != '\'') quote='\0'; for(l = len, p = string; l; p++, l--) { if(*p > 0x7f) { unichar_len = raptor_unicode_utf8_string_get_char(p, l, &unichar); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > l) { raptor_log_error(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Bad UTF-8 encoding."); return -1; } } else { unichar=*p; unichar_len = 1; } if(unichar == '&') /* & */ new_len+= 5; else if(unichar == '<' || (!quote && unichar == '>')) /* < or > */ new_len+= 4; else if(quote && unichar == (unsigned long)quote) /* ' or " */ new_len+= 6; else if(unichar == 0x0d || (quote && (unichar == 0x09 || unichar == 0x0a))) /* 
 or 	 or &xA; */ new_len+= 5; else if(unichar == 0x7f || (unichar < 0x20 && unichar != 0x09 && unichar != 0x0a)) { if(!unichar || xml_version < 11) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Cannot write illegal XML 1.0 character U+%6lX.", unichar); } else { /* &#xX; */ new_len+= 5; if(unichar > 0x0f) new_len++; } } else new_len+= unichar_len; unichar_len--; /* since loop does len-- */ p += unichar_len; l -= unichar_len; } if(length && new_len > length) return 0; if(!buffer) return RAPTOR_BAD_CAST(int, new_len); for(l = len, p = string, q = buffer; l; p++, l--) { if(*p > 0x7f) { unichar_len = raptor_unicode_utf8_string_get_char(p, l, &unichar); /* if the UTF-8 encoding is bad, we already did return -1 above */ } else { unichar=*p; unichar_len = 1; } if(unichar == '&') { memcpy(q, "&", 5); q+= 5; } else if(unichar == '<') { memcpy(q, "<", 4); q+= 4; } else if(!quote && unichar == '>') { memcpy(q, ">", 4); q+= 4; } else if(quote && unichar == (unsigned long)quote) { if(quote == '\'') memcpy(q, "'", 6); else memcpy(q, """, 6); q+= 6; } else if(unichar == 0x0d || (quote && (unichar == 0x09 || unichar == 0x0a))) { /* &#xX; */ *q++='&'; *q++='#'; *q++='x'; if(unichar == 0x09) *q++ = '9'; else *q++ = 'A'+ ((char)unichar-0x0a); *q++= ';'; } else if(unichar == 0x7f || (unichar < 0x20 && unichar != 0x09 && unichar != 0x0a)) { if(!unichar || xml_version < 11) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Cannot write illegal XML 1.0 character U+%6lX.", unichar); } else { /* &#xX; */ *q++ = '&'; *q++ = '#'; *q++ = 'x'; q += raptor_format_integer((char*)q, 3, RAPTOR_GOOD_CAST(unsigned int, unichar), /* base */ 16, -1, '\0'); *q++ = ';'; } } else { /* coverity[negative_returns] * negative unichar_len values are checked and cause return -1 above */ memcpy(q, p, unichar_len); q+= unichar_len; } unichar_len--; /* since loop does len-- */ p += unichar_len; l -= unichar_len; }