/*
 * Translate a run of ISO-8859-1 input bytes for the input source s.
 *
 * Every input byte is used directly as the character code.  A code that
 * is_xml_legal() rejects gets a message written to s->error_msg and is
 * replaced by -1 before ERROR_CHECK runs.
 *
 * NOTE(review): SETUP, ERROR_CHECK, LINEFEED, OUTPUT, MORE_BYTES and
 * END_OF_LINE are macros defined elsewhere.  Presumably they declare the
 * locals used here (c, inbuf, nextin, insize, startin, map) and supply
 * the function's return paths -- confirm against their definitions.
 */
static int translate_latin1(InputSource s)
{
    SETUP;

    for(; nextin < insize; )
    {
        /* Latin-1 bytes need no table lookup. */
        c = inbuf[nextin++];

        if(!is_xml_legal(c, map))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> "
                    "immediately before file offset %d",
                    c, s->bytes_consumed + nextin - startin);
            c = -1;
        }

        ERROR_CHECK;
        LINEFEED;
        OUTPUT;
    }

    MORE_BYTES;

    END_OF_LINE;
}
/*
 * Translate a run of input bytes in one of the table-driven ISO-8859
 * encodings for the input source s.
 *
 * The conversion table is picked from iso_to_unicode using the entity's
 * encoding relative to CE_ISO_8859_2.  A table entry of -1 marks a byte
 * with no mapping; that case, and a mapped character rejected by
 * is_xml_legal(), each write a message to s->error_msg and leave c == -1
 * for ERROR_CHECK.
 *
 * NOTE(review): SETUP, ERROR_CHECK, LINEFEED, OUTPUT, MORE_BYTES and
 * END_OF_LINE are macros defined elsewhere.  Presumably they declare the
 * locals used here (c, inbuf, nextin, insize, startin, map) and supply
 * the function's return paths -- confirm against their definitions.
 */
static int translate_latin(InputSource s)
{
    CharacterEncoding encoding = s->entity->encoding;
    int *table = iso_to_unicode[encoding - CE_ISO_8859_2];
    SETUP;

    for(; nextin < insize; )
    {
        c = table[inbuf[nextin++]];

        if(c != -1)
        {
            /* The byte has a mapping; the result must still be legal XML. */
            if(!is_xml_legal(c, map))
            {
                sprintf(s->error_msg,
                        "Illegal character <0x%x> "
                        "immediately before file offset %d",
                        c, s->bytes_consumed + nextin - startin);
                c = -1;
            }
        }
        else
        {
            /* No mapping: report the raw byte that was just consumed. */
            sprintf(s->error_msg,
                    "Illegal byte <0x%x> for encoding %s at file offset %d",
                    inbuf[nextin-1], CharacterEncodingName[encoding],
                    s->bytes_consumed + nextin - 1 - startin);
        }

        ERROR_CHECK;
        LINEFEED;
        OUTPUT;
    }

    MORE_BYTES;

    END_OF_LINE;
}
/*
 * get_translated_line1: read bytes from s->inbuf (starting at s->nextin),
 * convert them to Chars and store them in s->line until a line end has
 * been stored.
 *
 * - A '\r' is stored as '\n' and sets s->line_end_was_cr; a '\n' that
 *   arrives while ignore_linefeed is set (previous line ended with '\r')
 *   is dropped and counted into s->bytes_before_current_line.
 * - With CHAR_SIZE == 8 each byte is used as-is.  Otherwise bytes are
 *   decoded according to s->entity->encoding (UCS-2/UTF-16 in either
 *   byte order, ISO-8859-1 and ASCII supersets, ISO-8859-2..9 through
 *   iso_to_unicode, UTF-8), and values >= 0x10000 are written as a
 *   surrogate pair.
 * - Returns 0 once a '\n' has been stored, after writing nextin, insize,
 *   line, line_alloc and line_length back to s and adding the consumed
 *   byte count to s->bytes_consumed.
 * - Returns -1 after ERR/ERR2 for an illegal UTF-8 start byte, an
 *   unsupported encoding, or an illegal character.
 *
 * NOTE(review): in this view the definition is not closed -- the outer
 * while(1) loop, the more_bytes label targeted by the gotos, and the
 * function's closing brace are not visible, and the preprocessor
 * directives appear in the middle of a line.  Behaviour when the input
 * runs out cannot be determined from here; verify against the complete
 * original before relying on this code.
 */
static int get_translated_line1(InputSource s) { unsigned int c; /* can't use Char, it might be >0x10000 */ unsigned char *inbuf = s->inbuf; int nextin = s->nextin, insize = s->insize; int startin = s->nextin; Char *outbuf = s->line; int outsize = s->line_alloc; int nextout = 0; int remaining = 0; int ignore_linefeed = s->line_end_was_cr; #if CHAR_SIZE == 16 int *to_unicode = 0; /* initialize to shut gcc up */ CharacterEncoding enc = s->entity->encoding; int more, i; s->complicated_utf8_line = 0; if(enc >= CE_ISO_8859_2 && enc <= CE_ISO_8859_9) to_unicode = iso_to_unicode[enc - CE_ISO_8859_2]; #endif s->line_end_was_cr = 0; s->bytes_before_current_line = s->bytes_consumed; while(1) { /* There are never more characters than bytes in the input */ if(outsize < nextout + (insize - nextin)) { outsize = nextout + (insize - nextin); outbuf = Realloc(outbuf, outsize * sizeof(Char)); } while(nextin < insize) { #if CHAR_SIZE == 8 c = inbuf[nextin++]; #else switch(enc) { case CE_ISO_10646_UCS_2B: case CE_UTF_16B: if(nextin+2 > insize) goto more_bytes; c = (inbuf[nextin] << 8) + inbuf[nextin+1]; nextin += 2; break; case CE_ISO_10646_UCS_2L: case CE_UTF_16L: if(nextin+2 > insize) goto more_bytes; c = (inbuf[nextin+1] << 8) + inbuf[nextin]; nextin += 2; break; case CE_ISO_8859_1: case CE_unspecified_ascii_superset: c = inbuf[nextin++]; break; case CE_ISO_8859_2: case CE_ISO_8859_3: case CE_ISO_8859_4: case CE_ISO_8859_5: case CE_ISO_8859_6: case CE_ISO_8859_7: case CE_ISO_8859_8: case CE_ISO_8859_9: c = to_unicode[inbuf[nextin++]]; if(c == (unsigned int)-1) ERR3("Illegal %s character <0x%x> " "at file offset %d\n", CharacterEncodingName[enc], inbuf[nextin-1], s->bytes_consumed + nextin - 1 - startin); break; case CE_UTF_8: c = inbuf[nextin++]; if(c <= 0x7f) break; if(c <= 0xc0 || c >= 0xfe) { ERR2("Illegal UTF-8 start byte <0x%x> " "at file offset %d\n", c, s->bytes_consumed + nextin - 1 - startin); return -1; } if(c <= 0xdf) { c &= 0x1f; more = 1; } else if(c <= 0xef) { c &= 
0x0f; more = 2; } else if(c <= 0xf7) { c &= 0x07; more = 3; } else if(c <= 0xfb) { c &= 0x03; more = 4; } else { c &= 0x01; more = 5; } if(nextin+more > insize) { nextin--; goto more_bytes; } s->complicated_utf8_line = 1; for(i=0; i<more; i++) c = (c << 6) + (inbuf[nextin++] & 0x3f); break; default: ERR("read from entity with unsupported encoding!\n"); return -1; } if(c > 0x110000 || (c < 0x10000 && !is_xml_legal(c))) if(!(enc == CE_UTF_16L || enc == CE_UTF_16B) || c < 0xd800 || c > 0xdfff) /* We treat the surrogates as legal because we didn't combine them when translating from UTF-16. XXX */ { ERR2("Error: illegal character <0x%x> " "immediately before file offset %d\n", c, s->bytes_consumed + nextin - startin); return -1; } #endif if(c == '\n' && ignore_linefeed) { /* Ignore lf at start of line if last line ended with cr */ ignore_linefeed = 0; s->bytes_before_current_line += (nextin - startin); } else { ignore_linefeed = 0; if(c == '\r') { s->line_end_was_cr = 1; c = '\n'; } #if CHAR_SIZE == 16 if(c >= 0x10000) { /* Use surrogates */ outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800; outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00; } else outbuf[nextout++] = c; #else outbuf[nextout++] = c; #endif if(c == '\n') { s->nextin = nextin; s->insize = insize; s->bytes_consumed += (nextin - startin); s->line = outbuf; s->line_alloc = outsize; s->line_length = nextout; return 0; } } }
/*
 * Translate a run of 16-bit input units (UCS-2 / UTF-16) for the input
 * source s.
 *
 * Byte order is little-endian when the entity's encoding is
 * CE_ISO_10646_UCS_2L or CE_UTF_16L, big-endian otherwise.  Surrogate
 * pairing is tracked across calls in s->expecting_low_surrogate:
 *   - a low surrogate (0xdc00-0xdfff) is accepted only when one is
 *     expected, and clears the flag;
 *   - any other unit while a low surrogate is expected is an error;
 *   - a high surrogate (0xd800-0xdbff) sets the flag.
 * Surrogates are exempt from the is_xml_legal() check.  Every error
 * writes a message to s->error_msg and sets c to -1 for ERROR_CHECK.
 *
 * NOTE(review): SETUP, ERROR_CHECK, LINEFEED, OUTPUT, MORE_BYTES and
 * END_OF_LINE are macros defined elsewhere.  Presumably they declare the
 * locals used here (c, inbuf, nextin, insize, startin, map), define the
 * more_bytes label and supply the return paths -- confirm against their
 * definitions.
 */
static int translate_utf16(InputSource s)
{
    int little_endian = (s->entity->encoding == CE_ISO_10646_UCS_2L ||
                         s->entity->encoding == CE_UTF_16L);
    int is_low_half;
    SETUP;

    for(; nextin < insize; )
    {
        /* A unit needs two bytes; wait for more input if only one is left. */
        if(nextin+2 > insize)
            goto more_bytes;

        c = little_endian ? (inbuf[nextin+1] << 8) + inbuf[nextin]
                          : (inbuf[nextin] << 8) + inbuf[nextin+1];
        nextin += 2;

        is_low_half = (c >= 0xdc00 && c <= 0xdfff);

        if(s->expecting_low_surrogate)
        {
            if(is_low_half)
            {
                /* Pair completed. */
                s->expecting_low_surrogate = 0;
            }
            else
            {
                sprintf(s->error_msg,
                        "Expected low surrogate but got <0x%x> "
                        "at file offset %d",
                        c, s->bytes_consumed + nextin - startin - 2);
                c = -1;
            }
        }
        else if(is_low_half)
        {
            sprintf(s->error_msg,
                    "Unexpected low surrogate <0x%x> "
                    "at file offset %d",
                    c, s->bytes_consumed + nextin - startin - 2);
            c = -1;
        }

        /* high (1st) surrogate: the next unit must be its partner */
        if(c >= 0xd800 && c <= 0xdbff)
            s->expecting_low_surrogate = 1;

        /* surrogates are legal in utf-16 */
        if(c >= 0 && !is_xml_legal(c, map) && (c < 0xd800 || c > 0xdfff))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> "
                    "immediately before file offset %d",
                    c, s->bytes_consumed + nextin - startin);
            c = -1;
        }

        ERROR_CHECK;
        LINEFEED;
        OUTPUT;
    }

    MORE_BYTES;

    END_OF_LINE;
}
/*
 * Translate a run of UTF-8 input bytes for the input source s.
 *
 * Bytes up to 0x7f are used directly.  A start byte <= 0xc0 or >= 0xfe
 * is rejected.  Any other start byte selects 1 to 5 continuation bytes,
 * each of which must have the form 10xxxxxx.  A sequence that decodes to
 * a value below the minimum for its length (a non-shortest form) is
 * rejected, as is any result that is_xml_legal() refuses.  Every error
 * writes a message to s->error_msg and sets c to -1 for ERROR_CHECK.
 *
 * If the continuation bytes are not all in the buffer yet, the start
 * byte is pushed back and control goes to more_bytes.  Decoding a
 * multi-byte sequence marks the line as complicated and clears the
 * cached line char/byte positions.  When s->read_carefully is set, a
 * '>' ends the line early with s->line_is_incomplete set.
 *
 * NOTE(review): SETUP, ERROR_CHECK, LINEFEED, OUTPUT_WITH_SURROGATES,
 * MORE_BYTES and END_OF_LINE are macros defined elsewhere.  Presumably
 * they declare the locals used here (c, inbuf, nextin, insize, startin,
 * map), define the more_bytes / end_of_line labels and supply the return
 * paths -- confirm against their definitions.
 */
static int translate_utf8(InputSource s)
{
    /* Both tables are indexed by the continuation-byte count (1..5);
       slot 0 is unused. */
    static const int utf8_lead_mask[6] =
        { 0, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
    static const int utf8_min_code[6] =
        { 0, 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
    int extra, k, lowest;
    SETUP;

    for(; nextin < insize; )
    {
        c = inbuf[nextin++];

        if(c > 0x7f)
        {
            if(c <= 0xc0 || c >= 0xfe)
            {
                sprintf(s->error_msg,
                        "Illegal UTF-8 start byte <0x%x> at file offset %d",
                        c, s->bytes_consumed + nextin - 1 - startin);
                c = -1;
            }
            else
            {
                /* Classify the start byte by its number of followers. */
                if(c <= 0xdf)
                    extra = 1;
                else if(c <= 0xef)
                    extra = 2;
                else if(c <= 0xf7)
                    extra = 3;
                else if(c <= 0xfb)
                    extra = 4;
                else
                    extra = 5;

                c &= utf8_lead_mask[extra];
                lowest = utf8_min_code[extra];

                if(nextin+extra > insize)
                {
                    /* Incomplete sequence: un-read the start byte. */
                    nextin--;
                    goto more_bytes;
                }

                s->complicated_utf8_line = 1;
                s->cached_line_char = 0;
                s->cached_line_byte = 0;

                for(k = 0; k < extra; k++)
                {
                    int cont = inbuf[nextin++];

                    if((cont & 0xc0) != 0x80)
                    {
                        c = -1;
                        sprintf(s->error_msg,
                                "Illegal UTF-8 byte %d <0x%x> at file offset %d",
                                k+2, cont,
                                s->bytes_consumed + nextin - 1 - startin);
                        break;
                    }

                    c = (c << 6) + (cont & 0x3f);
                }

                if(c != -1 && c < lowest)
                {
                    sprintf(s->error_msg,
                            "Illegal (non-shortest) UTF-8 sequence for "
                            "character <0x%x> "
                            "immediately before file offset %d",
                            c, s->bytes_consumed + nextin - startin);
                    c = -1;
                }
            }
        }

        if(c >= 0 && !is_xml_legal(c, map))
        {
            sprintf(s->error_msg,
                    "Illegal character <0x%x> "
                    "immediately before file offset %d",
                    c, s->bytes_consumed + nextin - startin);
            c = -1;
        }

        ERROR_CHECK;
        LINEFEED;
        OUTPUT_WITH_SURROGATES;

        if(c == '>' && s->read_carefully)
        {
            s->line_is_incomplete = 1;
            goto end_of_line;
        }
    }

    MORE_BYTES;

    END_OF_LINE;
}