Example #1
0
static void
raptor_rss_error_handler(void *arg, 
                         const char *message,
                         xmlParserSeverities severity,
                         xmlTextReaderLocatorPtr xml_locator) 
{
  raptor_parser* rdf_parser=(raptor_parser*)arg;
  raptor_locator *locator=&rdf_parser->locator;

  locator->line= -1;
  locator->column= -1;
  if(arg)
    locator->line= xmlTextReaderLocatorLineNumber(xml_locator);

  raptor_parser_error(rdf_parser, message);
}
Example #2
0
static int 
raptor_ntriples_term_valid(raptor_parser* rdf_parser,
                           unsigned char c, int position, 
                           raptor_ntriples_term_class term_class) 
{
  int result = 0;

  switch(term_class) {
    case RAPTOR_TERM_CLASS_URI:
      /* ends on > */
      result = (c != '>');
      break;
      
    case RAPTOR_TERM_CLASS_BNODEID:  
      /* ends on first non [A-Za-z][A-Za-z0-9]* */
      result = IS_ASCII_ALPHA(c);
      if(position)
        result = (result || IS_ASCII_DIGIT(c));
      break;
      
    case RAPTOR_TERM_CLASS_STRING:
      /* ends on " */
      result = (c != '"');
      break;

    case RAPTOR_TERM_CLASS_LANGUAGE:
      /* ends on first non [a-zA-Z]+ ('-' [a-zA-Z0-9]+ )? */
      result = IS_ASCII_ALPHA(c);
      if(position)
        result = (result || IS_ASCII_DIGIT(c) || c == '-');
      break;
      
    default:
      raptor_parser_error(rdf_parser, "Unknown N-Triples term class %d",
                          term_class);
  }

  return result;
}
Example #3
0
static int
raptor_guess_parse_chunk(raptor_parser* rdf_parser, 
                        const unsigned char *buffer, size_t len,
                        int is_end)
{
  raptor_guess_parser_context* guess_parser = (raptor_guess_parser_context*)rdf_parser->context;

  if(guess_parser->do_guess) {
    const unsigned char *identifier = NULL;
    const char *name;
    
    guess_parser->do_guess = 0;

    if(rdf_parser->base_uri)
      identifier = raptor_uri_as_string(rdf_parser->base_uri);
    
    name = raptor_world_guess_parser_name(rdf_parser->world,
                                          NULL, guess_parser->content_type,
                                          buffer, len, identifier);
    if(!name) {
      raptor_parser_error(rdf_parser,
                          "Failed to guess parser from content type '%s'",
                          guess_parser->content_type ? 
                          guess_parser->content_type : "(none)");
      raptor_parser_parse_abort(rdf_parser);
      if(guess_parser->parser) {
        raptor_free_parser(guess_parser->parser);
        guess_parser->parser = NULL;
      }
      return 1;
    } else {
    
#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
      RAPTOR_DEBUG2("Guessed parser name '%s'\n", name);
#endif

      /* If there is an existing guessed parser factory present and
       * it's different from the wanted parser, free it
       */
      if(guess_parser->parser) {
        raptor_parser_factory* factory = raptor_world_get_parser_factory(rdf_parser->world, name);

        if(guess_parser->parser->factory != factory) {
          raptor_free_parser(guess_parser->parser);
          guess_parser->parser = NULL;
        }
      }
      
      if(!guess_parser->parser) {
        guess_parser->parser = raptor_new_parser(rdf_parser->world, name);
        if(!guess_parser->parser)
          return 1;
      }

      /* copy any user data to the grddl parser */
      if(raptor_parser_copy_user_state(guess_parser->parser, rdf_parser))
        return 1;
      
      if(raptor_parser_parse_start(guess_parser->parser, rdf_parser->base_uri))
        return 1;
    }
  }
  

  /* now we can pass on calls to internal guess_parser */
  return raptor_parser_parse_chunk(guess_parser->parser, buffer, len, is_end);
}
static int
raptor_ntriples_parse_chunk(raptor_parser* rdf_parser, 
                            const unsigned char *s, size_t len,
                            int is_end)
{
  unsigned char *buffer;
  unsigned char *ptr;
  unsigned char *start;
  raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)rdf_parser->context;
  
#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
  RAPTOR_DEBUG2("adding %d bytes to buffer\n", (unsigned int)len);
#endif

  /* No data?  It's the end */
  if(!len)
    return 0;

  buffer=(unsigned char*)RAPTOR_MALLOC(cstring, ntriples_parser->line_length + len + 1);
  if(!buffer) {
    raptor_parser_fatal_error(rdf_parser, "Out of memory");
    return 1;
  }

  if(ntriples_parser->line_length) {
    strncpy((char*)buffer, (const char*)ntriples_parser->line, ntriples_parser->line_length);
    RAPTOR_FREE(cstring, ntriples_parser->line);
  }

  ntriples_parser->line=buffer;

  /* move pointer to end of cdata buffer */
  ptr=buffer+ntriples_parser->line_length;

  /* adjust stored length */
  ntriples_parser->line_length += len;

  /* now write new stuff at end of cdata buffer */
  strncpy((char*)ptr, (const char*)s, len);
  ptr += len;
  *ptr = '\0';

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
  RAPTOR_DEBUG2("buffer now %d bytes\n", ntriples_parser->line_length);
#endif

  ptr=buffer+ntriples_parser->offset;
  while(*(start=ptr)) {
    unsigned char *line_start=ptr;
    
#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
  RAPTOR_DEBUG3("line buffer now '%s' (offset %d)\n", ptr, ptr-(buffer+ntriples_parser->offset));
#endif

    /* skip \n when just seen \r - i.e. \r\n or CR LF */
    if(ntriples_parser->last_char == '\r' && *ptr == '\n') {
#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
      RAPTOR_DEBUG1("skipping a \\n\n");
#endif
      ptr++;
      rdf_parser->locator.byte++;
      rdf_parser->locator.column=0;
      start=line_start=ptr;
    }

    while(*ptr && *ptr != '\n' && *ptr != '\r')
      ptr++;

    if(!*ptr)
      break;

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
    RAPTOR_DEBUG3("found newline \\x%02x at offset %d\n", *ptr,
                  ptr-line_start);
#endif
    ntriples_parser->last_char=*ptr;

    len=ptr-line_start;
    rdf_parser->locator.column=0;

    *ptr='\0';
    if(raptor_ntriples_parse_line(rdf_parser,line_start,len))
      return 1;
    
    rdf_parser->locator.line++;

    /* go past newline */
    ptr++;
    rdf_parser->locator.byte++;

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
    /* Do not peek if too far */
    if(ptr-buffer < ntriples_parser->line_length)
      RAPTOR_DEBUG2("next char is \\x%02x\n", *ptr);
    else
      RAPTOR_DEBUG1("next char unknown - end of buffer\n");
#endif
  }

  ntriples_parser->offset=start-buffer;

  len=ntriples_parser->line_length - ntriples_parser->offset;
    
  if(len) {
    /* collapse buffer */

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
    RAPTOR_DEBUG3("collapsing buffer from %d to %d bytes\n", ntriples_parser->line_length, (unsigned int)len);
#endif
    buffer=(unsigned char*)RAPTOR_MALLOC(cstring, len + 1);
    if(!buffer) {
      raptor_parser_fatal_error(rdf_parser, "Out of memory");
      return 1;
    }

    strncpy((char*)buffer, 
            (const char*)ntriples_parser->line+ntriples_parser->line_length-len,
            len);
    buffer[len]='\0';

    RAPTOR_FREE(cstring, ntriples_parser->line);

    ntriples_parser->line=buffer;
    ntriples_parser->line_length -= ntriples_parser->offset;
    ntriples_parser->offset=0;

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
    RAPTOR_DEBUG3("buffer now '%s' (%d bytes)\n", ntriples_parser->line, ntriples_parser->line_length);
#endif    
  }
  
  /* exit now, no more input */
  if(is_end) {
    if(ntriples_parser->offset != ntriples_parser->line_length) {
       raptor_parser_error(rdf_parser, "Junk at end of input.\"");
       return 1;
    }
    
    return 0;
  }
    
  return 0;
}
static int
raptor_ntriples_parse_line(raptor_parser* rdf_parser,
                           unsigned char *buffer, size_t len)
{
  int i;
  unsigned char *p;
  unsigned char *dest;
  unsigned char *terms[3];
  int terms_allocated[3];
  size_t term_lengths[3];
  raptor_ntriples_term_type term_types[3];
  size_t term_length= 0;
  unsigned char *object_literal_language=NULL;
  unsigned char *object_literal_datatype=NULL;
  int rc=0;
  
  for(i=0; i<3; i++)
    terms_allocated[i]=0;

  /* ASSERTION:
   * p always points to first char we are considering
   * p[len-1] always points to last char
   */
  
  /* Handle empty  lines */
  if(!len)
    return 0;

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
  RAPTOR_DEBUG3("handling line '%s' (%d bytes)\n", buffer, (unsigned int)len);
#endif
  
  p=buffer;

  while(len>0 && isspace((int)*p)) {
    p++;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;
    len--;
  }

  /* Handle empty - all whitespace lines */
  if(!len)
    return 0;
  
  /* Handle comment lines */
  if(*p == '#')
    return 0;
  
  /* Remove trailing spaces */
  while(len>0 && isspace((int)p[len-1])) {
    p[len-1]='\0';
    len--;
  }

  /* can't be empty now - that would have been caught above */
  
  /* Check for terminating '.' */
  if(p[len-1] != '.') {
    /* Move current location to point to problem */
    rdf_parser->locator.column += len-2;
    rdf_parser->locator.byte += len-2;
    raptor_parser_error(rdf_parser, "Missing . at end of line");
    return 0;
  }

  p[len-1]='\0';
  len--;


  /* Must be triple */

  for(i=0; i<3; i++) {
    if(!len) {
      raptor_parser_error(rdf_parser, "Unexpected end of line");
      goto cleanup;
    }
    
    /* Expect either <URI> or _:name */
    if(i == 2) {
      if(*p != '<' && *p != '_' && *p != '"' && *p != 'x') {
        raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>, _:bnodeID or \"literal\"", *p);
        goto cleanup;
      }
      if(*p == 'x') {
        if(len < 4 || strncmp((const char*)p, "xml\"", 4)) {
          raptor_parser_error(rdf_parser, "Saw '%c', expected xml\"...\")", *p);
          goto cleanup;
        }
      }
    } else if(i == 1) {
      if(*p != '<') {
        raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref>", *p);
        goto cleanup;
      }
    } else /* i==0 */ {
      if(*p != '<' && *p != '_') {
        raptor_parser_error(rdf_parser, "Saw '%c', expected <URIref> or _:bnodeID", *p);
        goto cleanup;
      }
    }

    switch(*p) {
      case '<':
        term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_URI_REF;
        
        dest=p;

        p++;
        len--;
        rdf_parser->locator.column++;
        rdf_parser->locator.byte++;

        if(raptor_ntriples_term(rdf_parser,
                                (const unsigned char**)&p, 
                                dest, &len, &term_length, 
                                '>', RAPTOR_TERM_CLASS_URI, 0)) {
          rc=1;
          goto cleanup;
        }
        break;

      case '"':
        term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
        
        dest=p;

        p++;
        len--;
        rdf_parser->locator.column++;
        rdf_parser->locator.byte++;

        if(raptor_ntriples_term(rdf_parser,
                                (const unsigned char**)&p,
                                dest, &len, &term_length,
                                '"', RAPTOR_TERM_CLASS_STRING, 0)) {
          rc=1;
          goto cleanup;
        }
        
        if(len && (*p == '-' || *p == '@')) {
          if(*p == '-')
            raptor_parser_error(rdf_parser, "Old N-Triples language syntax using \"string\"-lang rather than \"string\"@lang.");

          object_literal_language=p;

          /* Skip - */
          p++;
          len--;
          rdf_parser->locator.column++;
          rdf_parser->locator.byte++;

          if(!len) {
            raptor_parser_error(rdf_parser, "Missing language after \"string\"-");
            goto cleanup;
          }
          

          if(raptor_ntriples_term(rdf_parser,
                                  (const unsigned char**)&p,
                                  object_literal_language, &len, NULL,
                                  '\0', RAPTOR_TERM_CLASS_LANGUAGE, 0)) {
            rc=1;
            goto cleanup;
          }
        }

        if(len >1 && *p == '^' && p[1] == '^') {

          object_literal_datatype=p;

          /* Skip ^^ */
          p+= 2;
          len-= 2;
          rdf_parser->locator.column+= 2;
          rdf_parser->locator.byte+= 2;

          if(!len || (len && *p != '<')) {
            raptor_parser_error(rdf_parser, "Missing datatype URI-ref in\"string\"^^<URI-ref> after ^^");
            goto cleanup;
          }

          p++;
          len--;
          rdf_parser->locator.column++;
          rdf_parser->locator.byte++;

          if(raptor_ntriples_term(rdf_parser,
                                  (const unsigned char**)&p,
                                  object_literal_datatype, &len, NULL,
                                  '>', RAPTOR_TERM_CLASS_URI, 0)) {
            rc=1;
            goto cleanup;
          }
          
        }

        if(object_literal_datatype && object_literal_language) {
          raptor_parser_warning(rdf_parser, "Typed literal used with a language - ignoring the language");
          object_literal_language=NULL;
        }
          

        break;


      case '_':
        term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE;

        /* store where _ was */
        dest=p;

        p++;
        len--;
        rdf_parser->locator.column++;
        rdf_parser->locator.byte++;

        if(!len || (len > 0 && *p != ':')) {
          raptor_parser_error(rdf_parser, "Illegal bNodeID - _ not followed by :");
          goto cleanup;
        }

        /* Found ':' - move on */

        p++;
        len--;
        rdf_parser->locator.column++;
        rdf_parser->locator.byte++;

        if(raptor_ntriples_term(rdf_parser,
                                (const unsigned char**)&p,
                                dest, &len, &term_length,
                                '\0', RAPTOR_TERM_CLASS_BNODEID, 0)) {
          rc=1;
          goto cleanup;
        }

        if(!term_length) {
          raptor_parser_error(rdf_parser, "Bad or missing bNodeID after _:");
          goto cleanup;
        } else {
          unsigned char *blank=(unsigned char*)RAPTOR_MALLOC(cstring, term_length+1);
          if(!blank) {
            raptor_parser_fatal_error(rdf_parser, "Out of memory");
            rc=1;
            goto cleanup;
          }
          strcpy((char*)blank, (const char*)dest);
          dest=raptor_parser_internal_generate_id(rdf_parser, 
                                                  RAPTOR_GENID_TYPE_BNODEID,
                                                  blank);
          terms_allocated[i]=1;
        }

        break;

      case 'x':

        raptor_parser_error(rdf_parser, "Old N-Triples XML using xml\"string\"-lang rather than \"string\"@lang^^<%s>.", raptor_xml_literal_datatype_uri_string);

        /* already know we have 'xml"' coming up */
        term_types[i]= RAPTOR_NTRIPLES_TERM_TYPE_LITERAL;
        
        /* 3=strlen("xml") */
        p+=3;
        len-=3;

        dest=p;

        p++;
        len--;
        rdf_parser->locator.column++;
        rdf_parser->locator.byte++;

        if(raptor_ntriples_term(rdf_parser,
                                (const unsigned char**)&p,
                                dest, &len, &term_length, 
                                '"', RAPTOR_TERM_CLASS_STRING, 0)) {
          rc=1;
          goto cleanup;
        }

        /* got XML literal string */
        object_literal_datatype=(unsigned char*)raptor_xml_literal_datatype_uri_string;

        if(len && (*p == '-' || *p == '@')) {
          if(*p == '-')
            raptor_parser_error(rdf_parser, "Old N-Triples language syntax using xml\"string\"-lang rather than xml\"string\"@lang.");

          object_literal_language=p;

          /* Skip - */
          p++;
          len--;
          rdf_parser->locator.column++;
          rdf_parser->locator.byte++;

          if(!len) {
            raptor_parser_error(rdf_parser, "Missing language in xml\"string\"-language after -");
            goto cleanup;
          }

          if(raptor_ntriples_term(rdf_parser,
                                  (const unsigned char**)&p,
                                  object_literal_language, &len, NULL,
                                  '"', RAPTOR_TERM_CLASS_STRING, 0)) {
            rc=1;
            goto cleanup;
          }
          
        }

        if(len >1 && *p == '^' && p[1] == '^') {

          object_literal_datatype=p;

          /* Skip ^^ */
          p+= 2;
          len-= 2;
          rdf_parser->locator.column+= 2;
          rdf_parser->locator.byte+= 2;

          if(!len || (len && *p != '<')) {
            raptor_parser_error(rdf_parser, "Missing datatype URI-ref in xml\"string\"^^<URI-ref> after ^^");
            goto cleanup;
          }

          p++;
          len--;
          rdf_parser->locator.column++;
          rdf_parser->locator.byte++;

          if(raptor_ntriples_term(rdf_parser,
                                  (const unsigned char**)&p,
                                  object_literal_datatype, &len, NULL,
                                  '>', RAPTOR_TERM_CLASS_URI, 0)) {
            rc=1;
            goto cleanup;
          }
          
        }

        if(len) {
          if(*p != ' ') {
            raptor_parser_error(rdf_parser, "Missing terminating ' '");
            return 0;
          }

          p++;
          len--;
          rdf_parser->locator.column++;
          rdf_parser->locator.byte++;
        }
        
        break;


      default:
        raptor_parser_fatal_error(rdf_parser, "Unknown term type");
        rc=1;
        goto cleanup;
    }


    /* Store term */
    terms[i]=dest; term_lengths[i]=term_length;

    /* Whitespace must separate the terms */
    if(i<2 && !isspace((int)*p)) {
      raptor_parser_error(rdf_parser, "Missing whitespace after term '%s'", terms[i]);
      rc=1;
      goto cleanup;
    }

    /* Skip whitespace after terms */
    while(len>0 && isspace((int)*p)) {
      p++;
      len--;
      rdf_parser->locator.column++;
      rdf_parser->locator.byte++;
    }

#if defined(RAPTOR_DEBUG) && RAPTOR_DEBUG > 1
    fprintf(stderr, "item %d: term '%s' len %d type %s\n",
            i, terms[i], (unsigned int)term_lengths[i],
            raptor_ntriples_term_as_string(term_types[i]));
#endif    
  }

  if(len) {
    raptor_parser_error(rdf_parser, "Junk before terminating \".\"");
    return 0;
  }
  

  if(object_literal_language) {
    unsigned char *q;
    /* Normalize language to lowercase
     * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier
     */
    for(q=object_literal_language; *q; q++) {
      if(IS_ASCII_UPPER(*q))
        *q=TO_ASCII_LOWER(*q);
    }
  }

  raptor_ntriples_generate_statement(rdf_parser, 
                                     terms[0], term_types[0],
                                     terms[1], term_types[1],
                                     terms[2], term_types[2],
                                     object_literal_language,
                                     object_literal_datatype);

  rdf_parser->locator.byte += len;

 cleanup:
  for(i=0; i<3; i++)
    if(terms_allocated[i])
      RAPTOR_FREE(cstring, terms[i]);

  return rc;
}
/*
 * raptor_ntriples_term - Parse an N-Triples term with escapes
 * @parser: NTriples parser
 * @start: pointer to starting character of string (in)
 * @dest: destination of string (in)
 * @lenp: pointer to length of string (in/out)
 * @dest_lenp: pointer to length of destination string (out)
 * @end_char: string ending character
 * @class: string class
 * @allow_utf8: Non-0 if UTF-8 chars are allowed in the term
 * 
 * N-Triples strings/URIs are written in ASCII at present; characters
 * outside the printable ASCII range are discarded with a warning.
 * See the grammar for full details of the allowed ranges.
 *
 * If the class is RAPTOR_TERM_CLASS_FULL, the end_char is ignored.
 *
 * UTF-8 is only allowed if allow_utf8 is non-0, otherwise the
 * string is US-ASCII and only the \u and \U esapes are allowed.
 * If enabled, both are allowed.
 *
 * Return value: Non 0 on failure
 **/
static int
raptor_ntriples_term(raptor_parser* rdf_parser, 
                     const unsigned char **start, unsigned char *dest, 
                     size_t *lenp, size_t *dest_lenp,
                     char end_char,
                     raptor_ntriples_term_class term_class,
                     int allow_utf8)
{
  const unsigned char *p=*start;
  unsigned char c='\0';
  size_t ulen=0;
  unsigned long unichar=0;
  unsigned int position=0;
  int end_char_seen=0;

  if(term_class == RAPTOR_TERM_CLASS_FULL)
    end_char='\0';
  
  /* find end of string, fixing backslashed characters on the way */
  while(*lenp > 0) {
    c = *p;

    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    if(allow_utf8) {
      if(c > 0x7f) {
        /* just copy the UTF-8 bytes through */
        size_t unichar_len=raptor_utf8_to_unicode_char(NULL, (const unsigned char*)p-1, 1+*lenp);
        if(unichar_len > *lenp) {
          raptor_parser_error(rdf_parser, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
          /* UTF-8 encoding had an error or ended in the middle of a string */
          return 1;
        }
        memcpy(dest, p-1, unichar_len);
        dest+= unichar_len;

        unichar_len--; /* p, *lenp were moved on by 1 earlier */
        
        p += unichar_len;
        (*lenp) -= unichar_len;
        rdf_parser->locator.column+= unichar_len;
        rdf_parser->locator.byte+= unichar_len;
        continue;
      }
    } else if(!IS_ASCII_PRINT(c)) {
      /* This is an ASCII check, not a printable character check 
       * so isprint() is not appropriate, since that is a locale check.
       */
      raptor_parser_error(rdf_parser, "Non-printable ASCII character %d (0x%02X) found.", c, c);
      continue;
    }
    
    if(c != '\\') {
      /* finish at non-backslashed end_char */
      if(end_char && c == end_char) {
        end_char_seen=1;
        break;
      }

      if(!raptor_ntriples_term_valid(c, position, term_class)) {
        if(end_char) {
          /* end char was expected, so finding an invalid thing is an error */
          raptor_parser_error(rdf_parser, "Missing terminating '%c' (found '%c')", end_char, c);
          return 0;
        } else {
          /* it's the end - so rewind 1 to save next char */
          p--;
          (*lenp)++;
          rdf_parser->locator.column--;
          rdf_parser->locator.byte--;
          break;
        }
      }
      
      /* otherwise store and move on */
      *dest++=c;
      position++;
      continue;
    }

    if(!*lenp) {
      if(term_class != RAPTOR_TERM_CLASS_FULL)
        raptor_parser_error(rdf_parser, "\\ at end of line");
      return 0;
    }

    c = *p;

    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    switch(c) {
      case '"':
      case '\\':
        *dest++=c;
        break;
      case 'n':
        *dest++='\n';
        break;
      case 'r':
        *dest++='\r';
        break;
      case 't':
        *dest++='\t';
        break;
      case 'u':
      case 'U':
        ulen=(c == 'u') ? 4 : 8;
        
        if(*lenp < ulen) {
          raptor_parser_error(rdf_parser, "%c over end of line", c);
          return 0;
        }

        if(1) {
          int n;

          n=sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
          if(n != 1) {
            raptor_parser_error(rdf_parser, "Illegal Uncode escape '%c%s...'", c, p);
            break;
          }
        }

        p+=ulen;
        (*lenp)-=ulen;
        rdf_parser->locator.column+=ulen;
        rdf_parser->locator.byte+=ulen;
        
        if(unichar > 0x10ffff) {
          raptor_parser_error(rdf_parser, "Illegal Unicode character with code point #x%lX.", unichar);
          break;
        }
          
        dest+=raptor_unicode_char_to_utf8(unichar, dest);
        break;

      default:
        raptor_parser_error(rdf_parser,
                            "Illegal string escape \\%c in \"%s\"", c, 
                            (char*)start);
        return 0;
    }

    position++;
  } /* end while */

  
  if(end_char && !end_char_seen) {
    raptor_parser_error(rdf_parser, "Missing terminating '%c' before end of line.", end_char);
    return 1;
  }

  /* terminate dest, can be shorter than source */
  *dest='\0';

  if(dest_lenp)
    *dest_lenp=p-*start;

  *start=p;

  return 0;
}
static void
raptor_ntriples_generate_statement(raptor_parser* parser, 
                                   const unsigned char *subject,
                                   const raptor_ntriples_term_type subject_type,
                                   const unsigned char *predicate,
                                   const raptor_ntriples_term_type predicate_type,
                                   const void *object,
                                   const raptor_ntriples_term_type object_type,
                                   const unsigned char *object_literal_language,
                                   const unsigned char *object_literal_datatype)
{
  /* raptor_ntriples_parser_context *ntriples_parser=(raptor_ntriples_parser_context*)parser->context; */
  raptor_statement *statement=&parser->statement;
  raptor_uri *subject_uri=NULL;
  raptor_uri *predicate_uri=NULL;
  raptor_uri *object_uri=NULL;
  raptor_uri *datatype_uri=NULL;

  /* Two choices for subject from N-Triples */
  if(subject_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
    statement->subject=subject;
    statement->subject_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
  } else {
    subject_uri=raptor_new_uri_v2(parser->world, subject);
    if(!subject_uri) {
      raptor_parser_error(parser, "Could not create subject uri '%s', skipping", subject);
      goto cleanup;
    }
    statement->subject=subject_uri;
    statement->subject_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
  }

  if(object_literal_datatype) {
    datatype_uri=raptor_new_uri_v2(parser->world, object_literal_datatype);
    if(!datatype_uri) {
      raptor_parser_error(parser, "Could not create object literal datatype uri '%s', skipping", object_literal_datatype);
      goto cleanup;
    }
    object_literal_language=NULL;
  }

  /* Predicates in N-Triples are URIs but check for bad ordinals */
  if(!strncmp((const char*)predicate, "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) {
    int predicate_ordinal=raptor_check_ordinal(predicate+44);
    if(predicate_ordinal <= 0)
      raptor_parser_error(parser, "Illegal ordinal value %d in property '%s'.", predicate_ordinal, predicate);
  }
  
  predicate_uri=raptor_new_uri_v2(parser->world, predicate);
  if(!predicate_uri) {
    raptor_parser_error(parser, "Could not create predicate uri '%s', skipping", predicate);
    goto cleanup;
  }
  statement->predicate_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
  statement->predicate=predicate_uri;
  
  /* Three choices for object from N-Triples */
  statement->object_literal_language=NULL;
  statement->object_literal_datatype=NULL;
  if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_URI_REF) {
    object_uri=raptor_new_uri_v2(parser->world, (const unsigned char*)object);
    if(!object_uri) {
      raptor_parser_error(parser, "Could not create object uri '%s', skipping", (const char *)object);
      goto cleanup;
    }
    statement->object=object_uri;
    statement->object_type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
  } else if(object_type == RAPTOR_NTRIPLES_TERM_TYPE_BLANK_NODE) {
    statement->object=object;
    statement->object_type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
  } else { 
    statement->object_type=RAPTOR_IDENTIFIER_TYPE_LITERAL;
    statement->object=object;
    statement->object_literal_language=object_literal_language;
    statement->object_literal_datatype=datatype_uri;
  }

  if(!parser->statement_handler)
    goto cleanup;

  /* Generate the statement; or is it fact? */
  (*parser->statement_handler)(parser->user_data, statement);

  cleanup:
  if(subject_uri)
    raptor_free_uri_v2(parser->world, subject_uri);
  if(predicate_uri)
    raptor_free_uri_v2(parser->world, predicate_uri);
  if(object_uri)
    raptor_free_uri_v2(parser->world, object_uri);
  if(datatype_uri)
    raptor_free_uri_v2(parser->world, datatype_uri);
}
Example #8
0
/*
 * raptor_ntriples_term:
 * @parser: NTriples parser
 * @start: pointer to starting character of string (in)
 * @dest: destination of string (in)
 * @lenp: pointer to length of string (in/out)
 * @dest_lenp: pointer to length of destination string (out)
 * @end_char: string ending character
 * @class: string class
 *
 * Parse an N-Triples term with escapes.
 *
 * Relies that @dest is long enough; it need only be as large as the
 * input string @start since when UTF-8 encoding, the escapes are
 * removed and the result is always less than or equal to length of
 * input.
 *
 * N-Triples strings/URIs are written in ASCII at present; characters
 * outside the printable ASCII range are discarded with a warning.
 * See the grammar for full details of the allowed ranges.
 *
 * UTF-8 and the \u and \U esapes are both allowed.
 *
 * Return value: Non 0 on failure
 **/
static int
raptor_ntriples_term(raptor_parser* rdf_parser, 
                     const unsigned char **start, unsigned char *dest, 
                     size_t *lenp, size_t *dest_lenp,
                     char end_char,
                     raptor_ntriples_term_class term_class)
{
  const unsigned char *p = *start;
  unsigned char c = '\0';
  size_t ulen = 0;
  unsigned long unichar = 0;
  unsigned int position = 0;
  int end_char_seen = 0;

  /* find end of string, fixing backslashed characters on the way */
  while(*lenp > 0) {
    int unichar_width;

    c = *p;

    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    if(c > 0x7f) {
      /* just copy the UTF-8 bytes through */
      int unichar_len;
      unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL);
      if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) {
        raptor_parser_error(rdf_parser, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
        /* UTF-8 encoding had an error or ended in the middle of a string */
        return 1;
      }
      memcpy(dest, p-1, unichar_len);
      dest += unichar_len;

      unichar_len--; /* p, *lenp were moved on by 1 earlier */
      
      p += unichar_len;
      (*lenp) -= unichar_len;
      rdf_parser->locator.column += unichar_len;
      rdf_parser->locator.byte += unichar_len;
      continue;
    }
    
    if(c != '\\') {
      /* finish at non-backslashed end_char */
      if(end_char && c == end_char) {
        end_char_seen = 1;
        break;
      }

      if(!raptor_ntriples_term_valid(rdf_parser, c, position, term_class)) {
        if(end_char) {
          /* end char was expected, so finding an invalid thing is an error */
          raptor_parser_error(rdf_parser, "Missing terminating '%c' (found '%c')", end_char, c);
          return 0;
        } else {
          /* it's the end - so rewind 1 to save next char */
          p--;
          (*lenp)++;
          rdf_parser->locator.column--;
          rdf_parser->locator.byte--;
          break;
        }
      }
      
      /* otherwise store and move on */
      *dest++ = c;
      position++;
      continue;
    }

    if(!*lenp) {
      raptor_parser_error(rdf_parser, "\\ at end of line");
      return 0;
    }

    c = *p;

    p++;
    (*lenp)--;
    rdf_parser->locator.column++;
    rdf_parser->locator.byte++;

    switch(c) {
      case '"':
      case '\\':
        *dest++ = c;
        break;
      case 'n':
        *dest++ = '\n';
        break;
      case 'r':
        *dest++ = '\r';
        break;
      case 't':
        *dest++ = '\t';
        break;
      case '<':
      case '>':
      case '{':
      case '}':
      case '|':
      case '^':
      case '`':
        /* Turtle 2013 allows these in URIs (as well as \" and \\) */
        *dest++ = c;
        break;

      case 'u':
      case 'U':
        ulen = (c == 'u') ? 4 : 8;
        
        if(*lenp < ulen) {
          raptor_parser_error(rdf_parser, "%c over end of line", c);
          return 0;
        }

        if(1) {
          unsigned int ii;
          int n = 0;

          for(ii = 0; ii < ulen; ii++) {
            char cc = p[ii];
            if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) {
              raptor_parser_error(rdf_parser,
                            "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'",
                            cc, c, p);
              n = 1;
              break;
            }
          }

          if(n)
            break;
          
          n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
          if(n != 1) {
            raptor_parser_error(rdf_parser, "Illegal Uncode escape '%c%s...'", c, p);
            break;
          }
        }

        p += ulen;
        (*lenp) -= ulen;
        rdf_parser->locator.column += RAPTOR_GOOD_CAST(int, ulen);
        rdf_parser->locator.byte += RAPTOR_GOOD_CAST(int, ulen);
        
        if(unichar > raptor_unicode_max_codepoint) {
          raptor_parser_error(rdf_parser,
                              "Illegal Unicode character with code point #x%lX (max #x%lX).",
                              unichar, raptor_unicode_max_codepoint);
          break;
        }

        unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4);
        if(unichar_width < 0) {
          raptor_parser_error(rdf_parser,
                              "Illegal Unicode character with code point #x%lX.", 
                              unichar);
          break;
        }

        /* The destination length is set here to 4 since we know that in 
         * all cases, the UTF-8 encoded output sequence is always shorter
         * than the input sequence, and the buffer is edited in place.
         *   \uXXXX: 6 bytes input - UTF-8 max 3 bytes output
         *   \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output
         */
        dest += (int)unichar_width;
        break;

      default:
        raptor_parser_error(rdf_parser,
                            "Illegal string escape \\%c in \"%s\"", c, 
                            (char*)start);
        return 0;
    }

    position++;
  } /* end while */