/*
 * raptor_ntriples_parse_term_internal:
 * @world: raptor world, passed to raptor_log_error_formatted()
 * @locator: locator object whose column and byte fields are kept in
 *   step with the input position (in/out) (or NULL)
 * @start: pointer to starting character of string (in)
 * @dest: destination of string (in)
 * @lenp: pointer to length of string (in/out); decremented for every
 *   input byte consumed
 * @dest_lenp: pointer to length of destination string (out); not
 *   touched by the part of the function that is visible here
 * @end_char: string ending character, or 0 if the term has no
 *   terminator and ends at the first character rejected by
 *   raptor_ntriples_term_valid()
 * @term_class: string class
 *
 * INTERNAL - Parse an N-Triples term with escapes.
 *
 * Relies that @dest is long enough; it need only be as large as the
 * input string @start since when UTF-8 encoding, the escapes are
 * removed and the result is always less than or equal to length of
 * input.
 *
 * Bytes above 0x7f are treated as the start of a UTF-8 sequence and
 * copied through to @dest unchanged.  Backslash escapes are decoded:
 * the single character escapes handled by the switch below, and the
 * \u (4 hex digits) and \U (8 hex digits) escapes, which are written
 * to @dest as UTF-8.
 *
 * Return value: 1 on a UTF-8 encoding error.  NOTE(review): the other
 * error paths visible here log an error and then return 0; the final
 * return of the function is not visible at this point in the file.
 **/
static int
raptor_ntriples_parse_term_internal(raptor_world* world,
                                    raptor_locator* locator,
                                    const unsigned char **start,
                                    unsigned char *dest,
                                    size_t *lenp, size_t *dest_lenp,
                                    char end_char,
                                    raptor_ntriples_term_class term_class)
{
  const unsigned char *p = *start;
  unsigned char c = '\0';
  size_t ulen = 0;
  unsigned long unichar = 0;
  unsigned int position = 0;
  int end_char_seen = 0;

  /* find end of string, fixing backslashed characters on the way */
  while(*lenp > 0) {
    int unichar_width;

    c = *p;

    /* consume c: from here on p points one past it and *lenp counts
     * only the bytes that follow it
     */
    p++;
    (*lenp)--;
    if(locator) {
      locator->column++;
      locator->byte++;
    }

    if(c > 0x7f) {
      /* just copy the UTF-8 bytes through */
      int unichar_len;
      unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL);
      /* NOTE(review): *lenp was already decremented for the lead byte,
       * so a sequence that ends exactly at the end of the input has
       * unichar_len == *lenp + 1 and is rejected here - confirm this
       * is intended.
       */
      if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) {
        raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
        /* UTF-8 encoding had an error or ended in the middle of a string */
        return 1;
      }
      memmove(dest, p-1, unichar_len);
      dest += unichar_len;

      unichar_len--; /* p, *lenp were moved on by 1 earlier */

      p += unichar_len;
      (*lenp) -= unichar_len;
      if(locator) {
        locator->column += unichar_len;
        locator->byte += unichar_len;
      }
      continue;
    }

    if(c != '\\') {
      /* finish at non-backslashed end_char */
      if(end_char && c == end_char) {
        end_char_seen = 1;
        break;
      }

      if(!raptor_ntriples_term_valid(c, position, term_class)) {
        if(end_char) {
          /* end char was expected, so finding an invalid thing is an error */
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c);
          /* NOTE(review): an error was logged but 0 is returned */
          return 0;
        } else {
          /* it's the end - so rewind 1 to save next char */
          p--;
          (*lenp)++;
          if(locator) {
            locator->column--;
            locator->byte--;
          }
          /* NOTE(review): dest[-1] is read without checking that any
           * byte has been stored yet - presumably callers guarantee a
           * non-empty bnode id; verify.
           */
          if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') {
            /* If bnode id ended on '.' move back one */
            dest--;

            p--;
            (*lenp)++;
            if(locator) {
              locator->column--;
              locator->byte--;
            }
          }
          break;
        }
      }

      /* otherwise store and move on */
      *dest++ = c;
      position++;
      continue;
    }

    /* c is a backslash: an escape character must follow it */
    if(!*lenp) {
      raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input.");
      return 0;
    }

    c = *p;

    p++;
    (*lenp)--;
    if(locator) {
      locator->column++;
      locator->byte++;
    }

    switch(c) {
      case '"':
      case '\\':
        *dest++ = c;
        break;
      case 'b':
        *dest++ = '\b';
        break;
      case 'f':
        *dest++ = '\f';
        break;
      case 'n':
        *dest++ = '\n';
        break;
      case 'r':
        *dest++ = '\r';
        break;
      case 't':
        *dest++ = '\t';
        break;
      case '<':
      case '>':
      case '{':
      case '}':
      case '|':
      case '^':
      case '`':
        /* Turtle 2013 allows these in URIs (as well as \" and \\) */
        *dest++ = c;
        break;

      case 'u':
      case 'U':
        /* number of hex digits that must follow the escape letter */
        ulen = (c == 'u') ? 4 : 8;

        if(*lenp < ulen) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c);
          return 0;
        }

        if(1) {
          unsigned int ii;
          int n = 0;

          /* check every digit before converting, since sscanf() alone
           * would stop quietly at the first non-hex character
           */
          for(ii = 0; ii < ulen; ii++) {
            char cc = p[ii];
            if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) {
              raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, p);
              n = 1;
              break;
            }
          }

          /* bad digit: leave the switch without consuming the digits;
           * the loop carries on with the bytes after the escape letter
           */
          if(n)
            break;

          n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
          if(n != 1) {
            raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p);
            break;
          }
        }

        p += ulen;
        (*lenp) -= ulen;
        if(locator) {
          locator->column += RAPTOR_GOOD_CAST(int, ulen);
          locator->byte += RAPTOR_GOOD_CAST(int, ulen);
        }

        if(unichar > raptor_unicode_max_codepoint) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint);
          break;
        }

        unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4);
        if(unichar_width < 0) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar);
          break;
        }

        /* The destination length is set here to 4 since we know that in
         * all cases, the UTF-8 encoded output sequence is always shorter
         * than the input sequence, and the buffer is edited in place.
         *   \uXXXX: 6 bytes input - UTF-8 max 3 bytes output
         *   \UXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output
         */
        dest += (int)unichar_width;
        break;

      default:
        /* NOTE(review): start is a pointer to the string pointer, so
         * this hands the address of that pointer, not the string, to
         * the %s conversion - presumably *start was meant; confirm.
         */
        raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start);
        return 0;
    }

    position++;
  } /* end while */
  /* NOTE(review): the rest of this function (use of end_char_seen,
   * termination of dest, the out parameters and the final return) is
   * not present at this point in the file.
   */
/*
 * raptor_ntriples_hex_digit_value - Decode one ASCII hexadecimal digit
 * @ch: byte to decode
 *
 * Deliberately avoids <ctype.h>: the check must be an ASCII check and
 * must not depend on the current locale.
 *
 * Return value: 0..15 for '0'-'9', 'a'-'f', 'A'-'F', otherwise -1
 */
static int
raptor_ntriples_hex_digit_value(unsigned char ch)
{
  if(ch >= '0' && ch <= '9')
    return ch - '0';
  if(ch >= 'a' && ch <= 'f')
    return ch - 'a' + 10;
  if(ch >= 'A' && ch <= 'F')
    return ch - 'A' + 10;
  return -1;
}


/*
 * raptor_ntriples_term - Parse an N-Triples term with escapes
 * @rdf_parser: NTriples parser; used for error reporting and its
 *   locator column/byte are kept in step with the input position
 * @start: pointer to starting character of string (in/out); advanced
 *   past the consumed input only when the end of the function is
 *   reached
 * @dest: destination of string (in); may be the same buffer as the
 *   input, since the decoded output is never longer than the input
 * @lenp: pointer to length of string (in/out); decremented for every
 *   input byte consumed
 * @dest_lenp: if not NULL, receives the number of input bytes consumed
 *   (the distance the input pointer moved from *@start) (out)
 * @end_char: string ending character, or 0 if the term ends at the
 *   first character rejected by raptor_ntriples_term_valid()
 * @term_class: string class
 * @allow_utf8: Non-0 if UTF-8 chars are allowed in the term
 *
 * If the class is RAPTOR_TERM_CLASS_FULL, the end_char is ignored.
 *
 * UTF-8 is only allowed if allow_utf8 is non-0; bytes above 0x7f are
 * then copied through unchanged.  Otherwise the string is US-ASCII:
 * any byte failing IS_ASCII_PRINT() is reported and dropped, and
 * non-ASCII characters can only be written with the \u and \U escapes.
 * The escapes are accepted in both modes and must be followed by
 * exactly 4 (\u) or 8 (\U) hexadecimal digits.
 *
 * Return value: 1 on a UTF-8 encoding error or when @end_char was not
 * found before the end of the input, 0 after a successful parse.
 * NOTE: the remaining error paths (invalid character before the
 * terminator, backslash or \u / \U escape running over the end of the
 * input, unknown escape) report the error and return 0 without
 * terminating @dest or updating @start / @dest_lenp; this is kept
 * as-is because callers may rely on it.
 **/
static int
raptor_ntriples_term(raptor_parser* rdf_parser,
                     const unsigned char **start, unsigned char *dest,
                     size_t *lenp, size_t *dest_lenp,
                     char end_char,
                     raptor_ntriples_term_class term_class,
                     int allow_utf8)
{
  const unsigned char *p = *start;
  unsigned char c = '\0';
  size_t ulen = 0;
  unsigned long unichar = 0;
  unsigned int position = 0;
  int end_char_seen = 0;

  if(term_class == RAPTOR_TERM_CLASS_FULL)
    end_char = '\0';

  /* find end of string, fixing backslashed characters on the way */
  while(*lenp > 0) {
    c = *p;

    /* consume c: from here on p points one past it and *lenp counts
     * only the bytes that follow it
     */
    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    if(allow_utf8) {
      if(c > 0x7f) {
        /* just copy the UTF-8 bytes through */
        size_t unichar_len = raptor_utf8_to_unicode_char(NULL, p - 1, 1 + *lenp);

        /* unichar_len counts the lead byte c, which *lenp no longer
         * does, so the sequence fits when it is at most 1 + *lenp
         * bytes long.  A negative (error) result converted to size_t
         * is far larger than that and is rejected too, as is 0, which
         * would make the decrement below wrap around.
         */
        if(!unichar_len || unichar_len > 1 + *lenp) {
          raptor_parser_error(rdf_parser, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
          /* UTF-8 encoding had an error or ended in the middle of a string */
          return 1;
        }

        /* memmove, not memcpy: dest may be the input buffer itself */
        memmove(dest, p - 1, unichar_len);
        dest += unichar_len;

        unichar_len--; /* p, *lenp were moved on by 1 earlier */

        p += unichar_len;
        (*lenp) -= unichar_len;
        rdf_parser->locator.column += unichar_len;
        rdf_parser->locator.byte += unichar_len;
        continue;
      }
    } else if(!IS_ASCII_PRINT(c)) {
      /* This is an ASCII check, not a printable character check
       * so isprint() is not appropriate, since that is a locale check.
       */
      raptor_parser_error(rdf_parser, "Non-printable ASCII character %d (0x%02X) found.", c, c);
      /* the offending byte is dropped and parsing carries on */
      continue;
    }

    if(c != '\\') {
      /* finish at non-backslashed end_char */
      if(end_char && c == end_char) {
        end_char_seen = 1;
        break;
      }

      if(!raptor_ntriples_term_valid(c, position, term_class)) {
        if(end_char) {
          /* end char was expected, so finding an invalid thing is an error */
          raptor_parser_error(rdf_parser, "Missing terminating '%c' (found '%c')", end_char, c);
          return 0;
        } else {
          /* it's the end - so rewind 1 to save next char */
          p--;
          (*lenp)++;
          rdf_parser->locator.column--;
          rdf_parser->locator.byte--;
          break;
        }
      }

      /* otherwise store and move on */
      *dest++ = c;
      position++;
      continue;
    }

    /* c is a backslash: an escape character must follow it */
    if(!*lenp) {
      if(term_class != RAPTOR_TERM_CLASS_FULL)
        raptor_parser_error(rdf_parser, "\\ at end of line");
      return 0;
    }

    c = *p;

    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    switch(c) {
      case '"':
      case '\\':
        *dest++ = c;
        break;
      case 'n':
        *dest++ = '\n';
        break;
      case 'r':
        *dest++ = '\r';
        break;
      case 't':
        *dest++ = '\t';
        break;
      case 'u':
      case 'U':
        /* number of hex digits that must follow the escape letter */
        ulen = (c == 'u') ? 4 : 8;

        if(*lenp < ulen) {
          raptor_parser_error(rdf_parser, "%c over end of line", c);
          return 0;
        }

        {
          /* Decode exactly ulen hex digits.  A scanf-style conversion
           * is too lenient here: it skips leading whitespace, accepts
           * a sign or 0x prefix and stops quietly at the first
           * non-hex character while the full ulen bytes would still
           * be skipped below.
           */
          size_t ii;
          int bad_digit = 0;

          unichar = 0;
          for(ii = 0; ii < ulen; ii++) {
            int digit = raptor_ntriples_hex_digit_value(p[ii]);
            if(digit < 0) {
              bad_digit = 1;
              break;
            }
            unichar = (unichar << 4) | (unsigned long)digit;
          }

          if(bad_digit) {
            raptor_parser_error(rdf_parser, "Illegal Uncode escape '%c%s...'", c, (const char*)p);
            /* leave the switch without consuming the digits; the loop
             * carries on with the bytes after the escape letter
             */
            break;
          }
        }

        p += ulen;
        (*lenp) -= ulen;
        rdf_parser->locator.column += ulen;
        rdf_parser->locator.byte += ulen;

        if(unichar > 0x10ffff) {
          raptor_parser_error(rdf_parser, "Illegal Unicode character with code point #x%lX.", unichar);
          break;
        }

        dest += raptor_unicode_char_to_utf8(unichar, dest);
        break;

      default:
        /* start is a pointer to the string pointer: print the string
         * it refers to, not the bytes of the pointer itself
         */
        raptor_parser_error(rdf_parser, "Illegal string escape \\%c in \"%s\"", c, (const char*)*start);
        return 0;
    }

    position++;
  } /* end while */


  if(end_char && !end_char_seen) {
    raptor_parser_error(rdf_parser, "Missing terminating '%c' before end of line.", end_char);
    return 1;
  }

  /* terminate dest, can be shorter than source */
  *dest = '\0';

  /* note: this is the amount of input consumed, not the decoded length */
  if(dest_lenp)
    *dest_lenp = p - *start;

  *start = p;

  return 0;
}