/**
 * raptor_format_integer:
 * @buffer: buffer (or NULL)
 * @bufsize: size of above (or 0)
 * @integer: integer value to format
 * @base: numeric base from 2 up to 36
 * @width: minimum number of digit positions (or -1 for no minimum)
 * @padding: padding char (or \0 for a space)
 *
 * INTERNAL - Format an integer in the given base into a buffer or
 * calculate the size needed.
 *
 * Works like the C99 snprintf() but just for integers.
 *
 * If @buffer is NULL or the @bufsize is too small, the number of
 * bytes needed (excluding NUL) is returned and no formatting is done.
 *
 * A negative @integer is written with a leading '-' that is not
 * counted in @width; any padding is placed between the sign and the
 * digits.  Zero is written as a single digit.
 *
 * Return value: number of bytes needed or written (excluding NUL) or
 * 0 on failure (@base outside the range 2..36)
 */
size_t
raptor_format_integer(char* buffer, size_t bufsize, int integer,
                      unsigned int base, int width, char padding)
{
  size_t len = 1;
  size_t pos;
  unsigned int magnitude;
  unsigned int value;

  /* base 0 would divide by zero, base 1 would never terminate and
   * anything above 36 is outside the documented digit range */
  if(base < 2 || base > 36)
    return 0;

  if(integer < 0) {
    /* negate in unsigned arithmetic: -INT_MIN overflows as a signed int */
    magnitude = 0U - (unsigned int)integer;
    len++; /* room for the '-' sign */
  } else
    magnitude = (unsigned int)integer;

  /* count the digits after the first one */
  value = magnitude;
  while(value /= base)
    len++;

  if(width > 0) {
    /* the sign takes one position on top of the requested width;
     * computed as size_t so a huge width cannot overflow an int */
    size_t field = RAPTOR_GOOD_CAST(size_t, width);
    if(integer < 0)
      field++;
    if(field > len)
      len = field;
  }

  if(!buffer || bufsize < RAPTOR_GOOD_CAST(size_t, (len + 1))) /* +1 for NUL */
    return len;

  if(!padding)
    padding = ' ';

  /* fill from the right: NUL, digits, then padding */
  pos = len;
  buffer[pos] = '\0';

  value = magnitude;
  do {
    /* do/while so that a value of 0 still emits one digit */
    buffer[--pos] = digits[value % base];
    value /= base;
  } while(value > 0 && pos > 0);

  while(pos > 0)
    buffer[--pos] = padding;

  /* the first position was reserved for the sign above */
  if(integer < 0)
    *buffer = '-';

  return len;
}
/** * raptor_string_escaped_write: * @string: UTF-8 string to write * @len: length of UTF-8 string * @delim: Terminating delimiter character for string (such as " or >) or \0 for no escaping. * @flags: bit flags - see #raptor_escaped_write_bitflags * @iostr: #raptor_iostream to write to * * Write a UTF-8 string formatted using different escapes to a #raptor_iostream * * Supports writing escapes in the Python, N-Triples, Turtle, JSON, mKR, * SPARQL styles to an iostream. * * Return value: non-0 on failure such as bad UTF-8 encoding. **/ int raptor_string_escaped_write(const unsigned char *string, size_t len, const char delim, unsigned int flags, raptor_iostream *iostr) { unsigned char c; int unichar_len; raptor_unichar unichar; if(!string) return 1; for(; (c=*string); string++, len--) { if((delim && c == delim && (delim == '\'' || delim == '"')) || c == '\\') { raptor_iostream_write_byte('\\', iostr); raptor_iostream_write_byte(c, iostr); continue; } if(delim && c == delim) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_SPARQL_URI_ESCAPES) { /* Must escape #x00-#x20<>\"{}|^` */ if(c <= 0x20 || c == '<' || c == '>' || c == '\\' || c == '"' || c == '{' || c == '}' || c == '|' || c == '^' || c == '`') { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } else if(c < 0x7f) { raptor_iostream_write_byte(c, iostr); continue; } } if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_BS_ESCAPES_TNRU) { if(c == 0x09) { raptor_iostream_counted_string_write("\\t", 2, iostr); continue; } else if(c == 0x0a) { raptor_iostream_counted_string_write("\\n", 2, iostr); continue; } else if(c == 0x0d) { raptor_iostream_counted_string_write("\\r", 2, iostr); continue; } else if(c < 0x20 || c == 0x7f) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(c, 4, iostr); continue; } } if(flags & 
RAPTOR_ESCAPED_WRITE_BITFLAG_BS_ESCAPES_BF) { if(c == 0x08) { /* JSON has \b for backspace */ raptor_iostream_counted_string_write("\\b", 2, iostr); continue; } else if(c == 0x0b) { /* JSON has \f for formfeed */ raptor_iostream_counted_string_write("\\f", 2, iostr); continue; } } /* Just format remaining characters */ if(c < 0x7f) { raptor_iostream_write_byte(c, iostr); continue; } /* It is unicode */ unichar_len = raptor_unicode_utf8_string_get_char(string, len, &unichar); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > len) /* UTF-8 encoding had an error or ended in the middle of a string */ return 1; if(flags & RAPTOR_ESCAPED_WRITE_BITFLAG_UTF8) { /* UTF-8 is allowed so no need to escape */ raptor_iostream_counted_string_write(string, unichar_len, iostr); } else { if(unichar < 0x10000) { raptor_iostream_counted_string_write("\\u", 2, iostr); raptor_iostream_hexadecimal_write(RAPTOR_GOOD_CAST(unsigned int, unichar), 4, iostr); } else { raptor_iostream_counted_string_write("\\U", 2, iostr); raptor_iostream_hexadecimal_write(RAPTOR_GOOD_CAST(unsigned int, unichar), 8, iostr); } }
/* * raptor_ntriples_parse_term_internal: * @world: raptor world * @locator: locator object (in/out) (or NULL) * @start: pointer to starting character of string (in) * @dest: destination of string (in) * @lenp: pointer to length of string (in/out) * @dest_lenp: pointer to length of destination string (out) * @end_char: string ending character * @class: string class * * INTERNAL - Parse an N-Triples term with escapes. * * Relies that @dest is long enough; it need only be as large as the * input string @start since when UTF-8 encoding, the escapes are * removed and the result is always less than or equal to length of * input. * * N-Triples strings / URIs are written in ASCII at present; * characters outside the printable ASCII range are discarded with a * warning. See the grammar for full details of the allowed ranges. * * UTF-8 and the \u and \U esapes are both allowed. * * Return value: Non 0 on failure **/ static int raptor_ntriples_parse_term_internal(raptor_world* world, raptor_locator* locator, const unsigned char **start, unsigned char *dest, size_t *lenp, size_t *dest_lenp, char end_char, raptor_ntriples_term_class term_class) { const unsigned char *p = *start; unsigned char c = '\0'; size_t ulen = 0; unsigned long unichar = 0; unsigned int position = 0; int end_char_seen = 0; /* find end of string, fixing backslashed characters on the way */ while(*lenp > 0) { int unichar_width; c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } if(c > 0x7f) { /* just copy the UTF-8 bytes through */ int unichar_len; unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "UTF-8 encoding error at character %d (0x%02X) found.", c, c); /* UTF-8 encoding had an error or ended in the middle of a string */ return 1; } memmove(dest, p-1, unichar_len); dest += unichar_len; unichar_len--; /* p, *lenp were 
moved on by 1 earlier */ p += unichar_len; (*lenp) -= unichar_len; if(locator) { locator->column += unichar_len; locator->byte += unichar_len; } continue; } if(c != '\\') { /* finish at non-backslashed end_char */ if(end_char && c == end_char) { end_char_seen = 1; break; } if(!raptor_ntriples_term_valid(c, position, term_class)) { if(end_char) { /* end char was expected, so finding an invalid thing is an error */ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c); return 0; } else { /* it's the end - so rewind 1 to save next char */ p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') { /* If bnode id ended on '.' move back one */ dest--; p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } } break; } } /* otherwise store and move on */ *dest++ = c; position++; continue; } if(!*lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input."); return 0; } c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } switch(c) { case '"': case '\\': *dest++ = c; break; case 'b': *dest++ = '\b'; break; case 'f': *dest++ = '\f'; break; case 'n': *dest++ = '\n'; break; case 'r': *dest++ = '\r'; break; case 't': *dest++ = '\t'; break; case '<': case '>': case '{': case '}': case '|': case '^': case '`': /* Turtle 2013 allows these in URIs (as well as \" and \\) */ *dest++ = c; break; case 'u': case 'U': ulen = (c == 'u') ? 
4 : 8; if(*lenp < ulen) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c); return 0; } if(1) { unsigned int ii; int n = 0; for(ii = 0; ii < ulen; ii++) { char cc = p[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, p); n = 1; break; } } if(n) break; n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p); break; } } p += ulen; (*lenp) -= ulen; if(locator) { locator->column += RAPTOR_GOOD_CAST(int, ulen); locator->byte += RAPTOR_GOOD_CAST(int, ulen); } if(unichar > raptor_unicode_max_codepoint) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); break; } unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4); if(unichar_width < 0) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar); break; } /* The destination length is set here to 4 since we know that in * all cases, the UTF-8 encoded output sequence is always shorter * than the input sequence, and the buffer is edited in place. * \uXXXX: 6 bytes input - UTF-8 max 3 bytes output * \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output */ dest += (int)unichar_width; break; default: raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start); return 0; } position++; } /* end while */
/**
 * raptor_xml_escape_string_any:
 * @world: raptor world
 * @string: string to XML escape (UTF-8)
 * @len: length of string
 * @buffer: the buffer to use for new string (UTF-8) or NULL to just calculate expected length
 * @length: buffer size
 * @quote: optional quote character to escape for attribute content, or 0
 * @xml_version: XML 1.0 (10) or XML 1.1 (11)
 *
 * Return an XML-escaped version of a string.
 *
 * Follows
 * <ulink url="http://www.w3.org/TR/xml-c14n#ProcessingModel">Canonical XML rules on Text Nodes and Attribute Nodes</ulink>
 *
 * Both:
 * Replaces <literal>&</literal> and <literal><</literal>
 * with <literal>&amp;</literal> and <literal>&lt;</literal>
 * respectively, preserving other characters.
 *
 * Text Nodes:
 * <literal>></literal> is turned into <literal>&gt;</literal>
 * ##xD is turned into <literal>&##xD;</literal>
 *
 * Attribute Nodes:
 * <literal>></literal> is generated not <literal>&gt;</literal>.
 * ##x9, ##xA and ##xD are turned into
 * <literal>&##x9;</literal>,
 * <literal>&##xA;</literal> and
 * <literal>&##xD;</literal>
 * entities.
 *
 * If @quote is given it can be either of '\'' or '\"'
 * which will be turned into <literal>&apos;</literal> or
 * <literal>&quot;</literal> respectively.
 * ASCII NUL ('\0') or any other character will not be escaped.
 *
 * If @buffer is NULL, no work is done but the size of buffer
 * required is returned.  The output in buffer remains in UTF-8.
 *
 * If the input @string is empty, a single NUL will be written to the
 * buffer.
 *
 * Return value: the number of bytes required / used, 0 if @length is
 * non-zero but smaller than the escaped length, or <0 on failure.
**/ int raptor_xml_escape_string_any(raptor_world *world, const unsigned char *string, size_t len, unsigned char *buffer, size_t length, char quote, int xml_version) { size_t l; size_t new_len = 0; const unsigned char *p; unsigned char *q; int unichar_len; raptor_unichar unichar; if(!string) return -1; RAPTOR_ASSERT_OBJECT_POINTER_RETURN_VALUE(world, raptor_world, -1); raptor_world_open(world); if(quote != '\"' && quote != '\'') quote='\0'; for(l = len, p = string; l; p++, l--) { if(*p > 0x7f) { unichar_len = raptor_unicode_utf8_string_get_char(p, l, &unichar); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > l) { raptor_log_error(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Bad UTF-8 encoding."); return -1; } } else { unichar=*p; unichar_len = 1; } if(unichar == '&') /* & */ new_len+= 5; else if(unichar == '<' || (!quote && unichar == '>')) /* < or > */ new_len+= 4; else if(quote && unichar == (unsigned long)quote) /* ' or " */ new_len+= 6; else if(unichar == 0x0d || (quote && (unichar == 0x09 || unichar == 0x0a))) /* 
 or 	 or &xA; */ new_len+= 5; else if(unichar == 0x7f || (unichar < 0x20 && unichar != 0x09 && unichar != 0x0a)) { if(!unichar || xml_version < 11) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Cannot write illegal XML 1.0 character U+%6lX.", unichar); } else { /* &#xX; */ new_len+= 5; if(unichar > 0x0f) new_len++; } } else new_len+= unichar_len; unichar_len--; /* since loop does len-- */ p += unichar_len; l -= unichar_len; } if(length && new_len > length) return 0; if(!buffer) return RAPTOR_BAD_CAST(int, new_len); for(l = len, p = string, q = buffer; l; p++, l--) { if(*p > 0x7f) { unichar_len = raptor_unicode_utf8_string_get_char(p, l, &unichar); /* if the UTF-8 encoding is bad, we already did return -1 above */ } else { unichar=*p; unichar_len = 1; } if(unichar == '&') { memcpy(q, "&", 5); q+= 5; } else if(unichar == '<') { memcpy(q, "<", 4); q+= 4; } else if(!quote && unichar == '>') { memcpy(q, ">", 4); q+= 4; } else if(quote && unichar == (unsigned long)quote) { if(quote == '\'') memcpy(q, "'", 6); else memcpy(q, """, 6); q+= 6; } else if(unichar == 0x0d || (quote && (unichar == 0x09 || unichar == 0x0a))) { /* &#xX; */ *q++='&'; *q++='#'; *q++='x'; if(unichar == 0x09) *q++ = '9'; else *q++ = 'A'+ ((char)unichar-0x0a); *q++= ';'; } else if(unichar == 0x7f || (unichar < 0x20 && unichar != 0x09 && unichar != 0x0a)) { if(!unichar || xml_version < 11) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, NULL, "Cannot write illegal XML 1.0 character U+%6lX.", unichar); } else { /* &#xX; */ *q++ = '&'; *q++ = '#'; *q++ = 'x'; q += raptor_format_integer((char*)q, 3, RAPTOR_GOOD_CAST(unsigned int, unichar), /* base */ 16, -1, '\0'); *q++ = ';'; } } else { /* coverity[negative_returns] * negative unichar_len values are checked and cause return -1 above */ memcpy(q, p, unichar_len); q+= unichar_len; } unichar_len--; /* since loop does len-- */ p += unichar_len; l -= unichar_len; }