Beispiel #1
0
/*
 * decode_to_utf8:
 * @utf8_string: destination utf8 buffer (FIXME big enough!)
 * @unicode_string: first char of string
 * @end: last char of unicode_string
 */
static int
decode_to_utf8(unsigned char *utf8_string, size_t utf8_string_length,
               const char *unicode_string, const char *end)
{
  unsigned char *u = utf8_string;
  const char *p = unicode_string;
  
#ifdef RAPTOR_NFC_DECODE_DEBUG
  fputs("decode_to_utf8: string '", stderr);
  (void)fwrite(unicode_string, sizeof(char), (end-unicode_string) + 1, stderr);
  fputs("' converts to:\n  ", stderr);
#endif

  while(p < end) {
    unsigned long c = 0;
    char *endptr;

    if(*p == ' ') {
      p++;
      continue;
    }
    
    c = (unsigned long)strtol(p, &endptr, 16);

#ifdef RAPTOR_NFC_DECODE_DEBUG
    fprintf(stderr, "U+%04lX ", c);
#endif

    p = (const char*)endptr;
    
    u += raptor_unicode_utf8_string_put_char(c, u, (end-p));
    
    if((u-utf8_string) > (int)utf8_string_length) {
      fprintf(stderr,
              "decode_to_utf8 overwote utf8_string buffer at byte %d\n",
              (u-utf8_string));
      abort();
    }
  }

#ifdef RAPTOR_NFC_DECODE_DEBUG
  fputs("\n", stderr);
#endif

  return u-utf8_string;
}
Beispiel #2
0
/*
 * raptor_ntriples_parse_term_internal:
 * @world: raptor world
 * @locator: locator object (in/out) (or NULL)
 * @start: pointer to starting character of string (in)
 * @dest: destination of string (in)
 * @lenp: pointer to length of string (in/out)
 * @dest_lenp: pointer to length of destination string (out)
 * @end_char: string ending character
 * @class: string class
 *
 * INTERNAL - Parse an N-Triples term with escapes.
 *
 * Relies that @dest is long enough; it need only be as large as the
 * input string @start since when UTF-8 encoding, the escapes are
 * removed and the result is always less than or equal to length of
 * input.
 *
 * N-Triples strings / URIs are written in ASCII at present;
 * characters outside the printable ASCII range are discarded with a
 * warning.  See the grammar for full details of the allowed ranges.
 *
 * UTF-8 and the \u and \U esapes are both allowed.
 *
 * Return value: Non 0 on failure
 **/
static int
raptor_ntriples_parse_term_internal(raptor_world* world,
                                    raptor_locator* locator,
                                    const unsigned char **start,
                                    unsigned char *dest,
                                    size_t *lenp, size_t *dest_lenp,
                                    char end_char,
                                    raptor_ntriples_term_class term_class)
{
  const unsigned char *p = *start;
  unsigned char c = '\0';
  size_t ulen = 0;
  unsigned long unichar = 0;
  unsigned int position = 0;
  int end_char_seen = 0;

  /* find end of string, fixing backslashed characters on the way */
  while(*lenp > 0) {
    int unichar_width;

    c = *p;

    p++;
    (*lenp)--;
    if(locator) {
      locator->column++;
      locator->byte++;
    }

    if(c > 0x7f) {
      /* just copy the UTF-8 bytes through */
      int unichar_len;
      unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL);
      if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) {
        raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
        /* UTF-8 encoding had an error or ended in the middle of a string */
        return 1;
      }
      memmove(dest, p-1, unichar_len);
      dest += unichar_len;

      unichar_len--; /* p, *lenp were moved on by 1 earlier */

      p += unichar_len;
      (*lenp) -= unichar_len;
      if(locator) {
        locator->column += unichar_len;
        locator->byte += unichar_len;
      }
      continue;
    }

    if(c != '\\') {
      /* finish at non-backslashed end_char */
      if(end_char && c == end_char) {
        end_char_seen = 1;
        break;
      }

      if(!raptor_ntriples_term_valid(c, position, term_class)) {
        if(end_char) {
          /* end char was expected, so finding an invalid thing is an error */
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c);
          return 0;
        } else {
          /* it's the end - so rewind 1 to save next char */
          p--;
          (*lenp)++;
          if(locator) {
            locator->column--;
            locator->byte--;
          }
          if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') {
            /* If bnode id ended on '.' move back one */
            dest--;

            p--;
            (*lenp)++;
            if(locator) {
              locator->column--;
              locator->byte--;
            }
          }
          break;
        }
      }

      /* otherwise store and move on */
      *dest++ = c;
      position++;
      continue;
    }

    if(!*lenp) {
      raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input.");
      return 0;
    }

    c = *p;

    p++;
    (*lenp)--;
    if(locator) {
      locator->column++;
      locator->byte++;
    }

    switch(c) {
      case '"':
      case '\\':
        *dest++ = c;
        break;
      case 'b':
        *dest++ = '\b';
        break;
      case 'f':
        *dest++ = '\f';
        break;
      case 'n':
        *dest++ = '\n';
        break;
      case 'r':
        *dest++ = '\r';
        break;
      case 't':
        *dest++ = '\t';
        break;
      case '<':
      case '>':
      case '{':
      case '}':
      case '|':
      case '^':
      case '`':
        /* Turtle 2013 allows these in URIs (as well as \" and \\) */
        *dest++ = c;
        break;

      case 'u':
      case 'U':
        ulen = (c == 'u') ? 4 : 8;

        if(*lenp < ulen) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c);
          return 0;
        }

        if(1) {
          unsigned int ii;
          int n = 0;

          for(ii = 0; ii < ulen; ii++) {
            char cc = p[ii];
            if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) {
              raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'",
                            cc, c, p);
              n = 1;
              break;
            }
          }

          if(n)
            break;

          n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
          if(n != 1) {
            raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p);
            break;
          }
        }

        p += ulen;
        (*lenp) -= ulen;
        if(locator) {
          locator->column += RAPTOR_GOOD_CAST(int, ulen);
          locator->byte += RAPTOR_GOOD_CAST(int, ulen);
        }

        if(unichar > raptor_unicode_max_codepoint) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint);
          break;
        }

        unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4);
        if(unichar_width < 0) {
          raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar);
          break;
        }

        /* The destination length is set here to 4 since we know that in
         * all cases, the UTF-8 encoded output sequence is always shorter
         * than the input sequence, and the buffer is edited in place.
         *   \uXXXX: 6 bytes input - UTF-8 max 3 bytes output
         *   \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output
         */
        dest += (int)unichar_width;
        break;

      default:
        raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start);
        return 0;
    }

    position++;
  } /* end while */
/**
 * raptor_stringbuffer_append_turtle_string:
 * @stringbuffer: String buffer to add to
 * @text: turtle string to decode
 * @len: length of string
 * @delim: terminating delimiter for string - only ', " or &gt; are allowed
 * @error_handler: error handling function
 * @error_data: error handler data
 *
 * Append to a stringbuffer a Turtle-escaped string.
 *
 * The passed in string is handled according to the Turtle string
 * escape rules giving a UTF-8 encoded output of the Unicode codepoints.
 *
 * The Turtle escapes are \n \r \t \\
 * \uXXXX \UXXXXXXXX where X is [A-F0-9]
 * 
 * Return value: non-0 on failure
 **/
int
raptor_stringbuffer_append_turtle_string(raptor_stringbuffer* stringbuffer,
                                         const unsigned char *text,
                                         size_t len, int delim,
                                         raptor_simple_message_handler error_handler, 
                                         void *error_data)
{
  size_t i;
  const unsigned char *s;
  unsigned char *d;
  unsigned char *string = RAPTOR_MALLOC(unsigned char*, len + 1);
  
  if(!string)
    return -1;

  for(s = text, d = string, i = 0; i < len; s++, i++) {
    unsigned char c=*s;

    if(c == '\\' ) {
      s++; i++;
      c=*s;
      if(c == 'n')
        *d++= '\n';
      else if(c == 'r')
        *d++= '\r';
      else if(c == 't')
        *d++= '\t';
      else if(c == '\\' || c == delim)
        *d++=c;
      else if(c == 'u' || c == 'U') {
        size_t ulen = (c == 'u') ? 4 : 8;
        unsigned long unichar = 0;
        int n;
        int unichar_width;

        s++; i++;
        if(i+ulen > len) {
          error_handler(error_data,
                        "Turtle string error - \\%c over end of line", c);
          RAPTOR_FREE(char*, string);
          return 1;
        }
        
        n = sscanf((const char*)s, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
        if(n != 1) {
          error_handler(error_data,
                        "Turtle string error - illegal Uncode escape '%c%s...'",
                        c, s);
          RAPTOR_FREE(char*, string);
          return 1;
        }

        s+= ulen-1;
        i+= ulen-1;
        
        if(unichar > raptor_unicode_max_codepoint) {
          error_handler(error_data,
                        "Turtle string error - illegal Unicode character with code point #x%lX (max #x%lX).", 
                        unichar, raptor_unicode_max_codepoint);
          RAPTOR_FREE(char*, string);
          return 1;
        }
          
        unichar_width = raptor_unicode_utf8_string_put_char(unichar, d, 
                                                            len-(d-string));
        if(unichar_width < 0) {
          error_handler(error_data,
                        "Turtle string error - illegal Unicode character with code point #x%lX.", 
                        unichar);
          RAPTOR_FREE(char*, string);
          return 1;
        }
        d += (size_t)unichar_width;

      } else {
        /* don't handle \x where x isn't one of: \t \n \r \\ (delim) */
        error_handler(error_data,
                      "Turtle string error - illegal escape \\%c (#x%02X) in \"%s\"", 
                      c, c, text);
      }
    } else
      *d++=c;
  }
  *d='\0';

  /* calculate output string size */
  len = d-string;
  
  /* string gets owned by the stringbuffer after this */
  return raptor_stringbuffer_append_counted_string(stringbuffer, 
                                                   string, len, 0);

}
Beispiel #4
0
/**
 * rasqal_escaped_name_to_utf8_string:
 * @src: source name string
 * @len: length of source name string
 * @dest_lenp: pointer to store result string (or NULL)
 * @error_handler: error handling function
 * @error_data: data for error handle
 *
 * Get a UTF-8 and/or \u-escaped name as UTF-8.
 *
 * If dest_lenp is not NULL, the length of the resulting string is
 * stored at the pointed size_t.
 *
 * Return value: new UTF-8 string or NULL on failure.
 */
unsigned char*
rasqal_escaped_name_to_utf8_string(const unsigned char *src, size_t len,
                                   size_t *dest_lenp,
                                   int (*error_handler)(rasqal_query *error_data, const char *message, ...),
                                   rasqal_query* error_data)
{
  const unsigned char *p=src;
  size_t ulen=0;
  unsigned long unichar=0;
  unsigned char *result;
  unsigned char *dest;
  unsigned char *endp;
  int n;
  
  result = RASQAL_MALLOC(unsigned char*, len + 1);
  if(!result)
    return NULL;

  dest = result;
  endp = result + len;

  /* find end of string, fixing backslashed characters on the way */
  while(len > 0) {
    unsigned char c=*p;

    if(c > 0x7f) {
      /* just copy the UTF-8 bytes through */
      size_t unichar_len = raptor_unicode_utf8_string_get_char(RASQAL_GOOD_CAST(const unsigned char*, p), len + 1, NULL);
      if(unichar_len > len) {
        if(error_handler)
          error_handler(error_data, "UTF-8 encoding error at character %d (0x%02X) found.", c, c);
        /* UTF-8 encoding had an error or ended in the middle of a string */
        RASQAL_FREE(char*, result);
        return NULL;
      }
      memcpy(dest, p, unichar_len);
      dest+= unichar_len;
      p += unichar_len;
      len -= unichar_len;
      continue;
    }

    p++; len--;
    
    if(c != '\\') {
      /* not an escape - store and move on */
      *dest++=c;
      continue;
    }

    if(!len) {
      RASQAL_FREE(char*, result);
      return NULL;
    }

    c = *p++; len--;

    switch(c) {
      case '"':
      case '\\':
        *dest++=c;
        break;
      case 'u':
      case 'U':
        ulen=(c == 'u') ? 4 : 8;
        
        if(len < ulen) {
          if(error_handler)
            error_handler(error_data, "%c over end of line", c);
          RASQAL_FREE(char*, result);
          return 0;
        }
        
        n = sscanf(RASQAL_GOOD_CAST(const char*, p), ((ulen == 4) ? "%04lx" : "%08lx"), &unichar);
        if(n != 1) {
          if(error_handler)
            error_handler(error_data, "Bad %c escape", c);
          break;
        }
        
        p+=ulen;
        len-=ulen;
        
        if(unichar > 0x10ffff) {
          if(error_handler)
            error_handler(error_data, "Illegal Unicode character with code point #x%lX.", unichar);
          break;
        }
          
        dest += raptor_unicode_utf8_string_put_char(unichar, dest, endp - dest);
        break;

      default:
        if(error_handler)
          error_handler(error_data, "Illegal string escape \\%c in \"%s\"", c, src);
        RASQAL_FREE(char*, result);
        return 0;
    }

  } /* end while */