Example #1
0
/*
 * Translate ISO-8859-1 (Latin-1) input bytes.
 *
 * Each input byte is used directly as the character code.  If
 * is_xml_legal() rejects the code, a message is written to s->error_msg
 * and c is set to -1 before ERROR_CHECK runs.  Local variable setup,
 * error/linefeed handling, output and the function's exits all come from
 * the SETUP, ERROR_CHECK, LINEFEED, OUTPUT, MORE_BYTES and END_OF_LINE
 * macros, which are defined elsewhere in the file.
 */
static int translate_latin1(InputSource s)
{
	SETUP;

	while(nextin < insize)
	{
		/* One byte is one character in Latin-1 */
		c = inbuf[nextin];
		nextin++;

		if(!is_xml_legal(c, map))
		{
			/* nextin already points past the offending byte */
			sprintf(s->error_msg,
				"Illegal character <0x%x> immediately before file offset %d",
				c, s->bytes_consumed + nextin - startin);
			c = -1;
		}

		ERROR_CHECK;

		LINEFEED;

		OUTPUT;
	}

	MORE_BYTES;

	END_OF_LINE;
}
Example #2
0
/*
 * Translate input in one of the table-driven ISO-8859-n encodings.
 *
 * The conversion table is picked from iso_to_unicode[] using the entity's
 * encoding relative to CE_ISO_8859_2.  A table entry of -1 marks a byte
 * with no mapping; a mapped code must additionally pass is_xml_legal().
 * In both failure cases a message is written to s->error_msg and c is -1
 * when ERROR_CHECK runs.  The SETUP, ERROR_CHECK, LINEFEED, OUTPUT,
 * MORE_BYTES and END_OF_LINE macros are defined elsewhere in the file.
 */
static int translate_latin(InputSource s)
{
	CharacterEncoding charset = s->entity->encoding;
	int *unicode_of = iso_to_unicode[charset - CE_ISO_8859_2];
	SETUP;

	while(nextin < insize)
	{
		c = unicode_of[inbuf[nextin++]];

		if(c != -1)
		{
			/* The byte mapped to a code; check that XML allows it */
			if(!is_xml_legal(c, map))
			{
				sprintf(s->error_msg,
					"Illegal character <0x%x> immediately before file offset %d",
					c, s->bytes_consumed + nextin - startin);
				c = -1;
			}
		}
		else
		{
			/* No mapping: report the raw byte just consumed */
			sprintf(s->error_msg,
				"Illegal byte <0x%x> for encoding %s at file offset %d",
				inbuf[nextin-1], CharacterEncodingName[charset],
				s->bytes_consumed + nextin - 1 - startin);
		}

		ERROR_CHECK;

		LINEFEED;

		OUTPUT;
	}

	MORE_BYTES;

	END_OF_LINE;
}
Example #3
0
/*
 * Read bytes from s->inbuf, starting at s->nextin, and store the decoded
 * characters of one line in s->line.
 *
 * With CHAR_SIZE == 8 each byte is copied unchanged.  Otherwise the bytes
 * are decoded according to s->entity->encoding (UCS-2/UTF-16 in either
 * byte order, ISO-8859-1..9, UTF-8) and characters at or above 0x10000
 * are stored as a surrogate pair.  A carriage return is stored as '\n'
 * and recorded in s->line_end_was_cr; a linefeed arriving first after
 * such a line is dropped.
 *
 * Returns 0 once a '\n' has been stored, after updating the position and
 * line fields of s.  Returns -1 on the error paths visible here.
 *
 * NOTE(review): this excerpt ends inside the outer while(1) loop; the
 * more_bytes label and the remainder of the function are not visible.
 */
static int get_translated_line1(InputSource s)
{
    unsigned int c;		/* can't use Char, it might be >0x10000 */
    unsigned char *inbuf = s->inbuf;
    int nextin = s->nextin, insize = s->insize;
    /* Input position on entry; used to compute byte counts and offsets */
    int startin = s->nextin;
    Char *outbuf = s->line;
    int outsize = s->line_alloc;
    int nextout = 0;
    int remaining = 0;
    /* Set when the previous line ended with CR, so a leading LF is dropped */
    int ignore_linefeed = s->line_end_was_cr;

#if CHAR_SIZE == 16

    int *to_unicode = 0;	/* initialize to shut gcc up */
    CharacterEncoding enc = s->entity->encoding;
    int more, i;
    s->complicated_utf8_line = 0;

    /* Only ISO-8859-2..9 use a conversion table */
    if(enc >= CE_ISO_8859_2 && enc <= CE_ISO_8859_9)
	to_unicode = iso_to_unicode[enc - CE_ISO_8859_2];

#endif

    s->line_end_was_cr = 0;
    s->bytes_before_current_line = s->bytes_consumed;

    while(1)
    {
	/* There are never more characters than bytes in the input */
	/* NOTE(review): the Realloc result is used without a check here;
	   presumably Realloc handles failure itself -- confirm */
	if(outsize < nextout + (insize - nextin))
	{
	    outsize = nextout + (insize - nextin);
	    outbuf = Realloc(outbuf, outsize * sizeof(Char));
	}

	while(nextin < insize)
	{
#if CHAR_SIZE == 8
	    c = inbuf[nextin++];
#else
	    switch(enc)
	    {
	    /* Two-byte units, most significant byte first */
	    case CE_ISO_10646_UCS_2B:
	    case CE_UTF_16B:
		if(nextin+2 > insize)
		    goto more_bytes;
		c = (inbuf[nextin] << 8) + inbuf[nextin+1];
		nextin += 2;
		break;
	    /* Two-byte units, least significant byte first */
	    case CE_ISO_10646_UCS_2L:
	    case CE_UTF_16L:
		if(nextin+2 > insize)
		    goto more_bytes;
		c = (inbuf[nextin+1] << 8) + inbuf[nextin];
		nextin += 2;
		break;
	    /* The byte value is used as the character code */
	    case CE_ISO_8859_1:
	    case CE_unspecified_ascii_superset:
		c = inbuf[nextin++];
		break;
	    case CE_ISO_8859_2:
	    case CE_ISO_8859_3:
	    case CE_ISO_8859_4:
	    case CE_ISO_8859_5:
	    case CE_ISO_8859_6:
	    case CE_ISO_8859_7:
	    case CE_ISO_8859_8:
	    case CE_ISO_8859_9:
		c = to_unicode[inbuf[nextin++]];
		/* A table entry of -1 marks an unmapped byte.  There is no
		   return here: c is then 0xffffffff (or larger), so the range
		   check after the switch also reports it and returns -1. */
		if(c == (unsigned int)-1)
		  ERR3("Illegal %s character <0x%x> "
			    "at file offset %d\n",
			    CharacterEncodingName[enc], inbuf[nextin-1],
			    s->bytes_consumed + nextin - 1 - startin);
		break;
	    case CE_UTF_8:
		c = inbuf[nextin++];
		/* Single-byte (ASCII) character */
		if(c <= 0x7f)
		    break;
		/* 0x80..0xc0 and 0xfe..0xff are rejected as start bytes */
		if(c <= 0xc0 || c >= 0xfe)
		{
		  ERR2("Illegal UTF-8 start byte <0x%x> "
			    "at file offset %d\n",
			    c, s->bytes_consumed + nextin - 1 - startin);
		    return -1;
		}
		/* Keep the payload bits of the start byte and note how many
		   further bytes belong to this character */
		if(c <= 0xdf)
		{
		    c &= 0x1f;
		    more = 1;
		}
		else if(c <= 0xef)
		{
		    c &= 0x0f;
		    more = 2;
		}
		else if(c <= 0xf7)
		{
		    c &= 0x07;
		    more = 3;
		}
		else if(c <= 0xfb)
		{
		    c &= 0x03;
		    more = 4;
		}
		else
		{
		    c &= 0x01;
		    more = 5;
		}
		/* Sequence not fully buffered: un-read the start byte */
		if(nextin+more > insize)
		{
		    nextin--;
		    goto more_bytes;
		}
		s->complicated_utf8_line = 1;
		/* Each following byte contributes its low six bits.
		   NOTE(review): the top two bits of these bytes are not
		   checked in this loop. */
		for(i=0; i<more; i++)
		    c = (c << 6) + (inbuf[nextin++] & 0x3f);
		break;
	    default:
	      ERR("read from entity with unsupported encoding!\n");
		return -1;
	    }

	    /* NOTE(review): "c > 0x110000" lets the value 0x110000 through,
	       although the largest Unicode code point is 0x10ffff -- confirm
	       whether ">=" was intended. */
	    if(c > 0x110000 || (c < 0x10000 && !is_xml_legal(c)))
		if(!(enc == CE_UTF_16L || enc == CE_UTF_16B) ||
		   c < 0xd800 || c > 0xdfff)
		    /* We treat the surrogates as legal because we didn't
		       combine them when translating from UTF-16.  XXX */
		{
		  ERR2("Error: illegal character <0x%x> "
			    "immediately before file offset %d\n",
			    c, s->bytes_consumed + nextin - startin);
		    return -1;
		}
#endif
	    if(c == '\n' && ignore_linefeed)
	    {
		/* Ignore lf at start of line if last line ended with cr */
		ignore_linefeed = 0;
		s->bytes_before_current_line += (nextin - startin);
	    }		
	    else
	    {
		ignore_linefeed = 0;
		/* A carriage return is stored as a newline and remembered */
		if(c == '\r')
		{
		    s->line_end_was_cr = 1;
		    c = '\n';
		}

#if CHAR_SIZE == 16
		if(c >= 0x10000)
		{
		    /* Use surrogates */
		    outbuf[nextout++] = ((c - 0x10000) >> 10) + 0xd800;
		    outbuf[nextout++] = ((c - 0x10000) & 0x3ff) + 0xdc00;
		}
		else
		    outbuf[nextout++] = c;
#else
		outbuf[nextout++] = c;
#endif

		/* Line complete: write the local state back into s */
		if(c == '\n')
		{
		    s->nextin = nextin;
		    s->insize = insize;
		    s->bytes_consumed += (nextin - startin);
		    s->line = outbuf;
		    s->line_alloc = outsize;
		    s->line_length = nextout;
		    return 0;
		}
	    }
	}
Example #4
0
/*
 * Translate UTF-16 / UCS-2 input, in either byte order.
 *
 * Two bytes are combined into one 16-bit unit; the byte order is taken
 * from the entity's encoding.  s->expecting_low_surrogate records that
 * the previous unit was a high surrogate: a low surrogate is accepted
 * only in that state, and any other unit seen in that state is an error.
 * Non-surrogate units must also pass is_xml_legal().  On any error a
 * message is written to s->error_msg and c is set to -1 before
 * ERROR_CHECK runs.  The SETUP, ERROR_CHECK, LINEFEED, OUTPUT, MORE_BYTES
 * and END_OF_LINE macros are defined elsewhere in the file.
 */
static int translate_utf16(InputSource s)
{
	int little_endian = (s->entity->encoding == CE_ISO_10646_UCS_2L ||
			     s->entity->encoding == CE_UTF_16L);
	SETUP;

	while(nextin < insize)
	{
		/* A unit needs two bytes; wait for more input if only one is left */
		if(nextin+2 > insize)
			goto more_bytes;

		c = little_endian ? (inbuf[nextin+1] << 8) + inbuf[nextin]
				  : (inbuf[nextin] << 8) + inbuf[nextin+1];
		nextin += 2;

		if(s->expecting_low_surrogate)
		{
			if(c >= 0xdc00 && c <= 0xdfff)
			{
				/* The pair is complete */
				s->expecting_low_surrogate = 0;
			}
			else
			{
				sprintf(s->error_msg,
					"Expected low surrogate but got <0x%x> at file offset %d",
					c, s->bytes_consumed + nextin - startin - 2);
				c = -1;
			}
		}
		else if(c >= 0xdc00 && c <= 0xdfff)
		{
			/* Low (2nd) surrogate without a preceding high one */
			sprintf(s->error_msg,
				"Unexpected low surrogate <0x%x> at file offset %d",
				c, s->bytes_consumed + nextin - startin - 2);
			c = -1;
		}

		/* High (1st) surrogate: the next unit must be a low surrogate */
		if(c >= 0xd800 && c <= 0xdbff)
			s->expecting_low_surrogate = 1;

		/* Surrogates are legal in utf-16, so they are exempt here */
		if(c >= 0 && !is_xml_legal(c, map) &&
		   (c < 0xd800 || c > 0xdfff))
		{
			sprintf(s->error_msg,
				"Illegal character <0x%x> immediately before file offset %d",
				c, s->bytes_consumed + nextin - startin);
			c = -1;
		}

		ERROR_CHECK;

		LINEFEED;

		OUTPUT;
	}

	MORE_BYTES;

	END_OF_LINE;
}
Example #5
0
/*
 * Translate UTF-8 input.
 *
 * Bytes up to 0x7f are single-byte characters.  Bytes 0x80..0xc0 and
 * 0xfe..0xff are rejected as start bytes.  Any other start byte selects
 * the number of continuation bytes (1 to 5) and the smallest code that
 * may legitimately use that length.  Each continuation byte must have the
 * form 10xxxxxx, and a result below that smallest code is rejected as a
 * non-shortest encoding.  The decoded code must finally pass
 * is_xml_legal().  On any error a message is written to s->error_msg and
 * c is set to -1 before ERROR_CHECK runs.
 *
 * If the multi-byte sequence is not fully buffered, the start byte is
 * un-read and control goes to more_bytes.  When s->read_carefully is set,
 * the line is marked incomplete and ended after a '>' has been output.
 * The SETUP, ERROR_CHECK, LINEFEED, OUTPUT_WITH_SURROGATES, MORE_BYTES
 * and END_OF_LINE macros are defined elsewhere in the file.
 */
static int translate_utf8(InputSource s)
{
	int extra, k, lowest;
	SETUP;

	while(nextin < insize)
	{
		c = inbuf[nextin++];

		if(c > 0x7f)
		{
			if(c <= 0xc0 || c >= 0xfe)
			{
				sprintf(s->error_msg,
					"Illegal UTF-8 start byte <0x%x> at file offset %d",
					c, s->bytes_consumed + nextin - 1 - startin);
				c = -1;
			}
			else
			{
				/* Start byte is in 0xc1..0xfd: keep its payload bits */
				if(c >= 0xfc)
				{
					c &= 0x01;
					extra = 5;
					lowest = 0x4000000;
				}
				else if(c >= 0xf8)
				{
					c &= 0x03;
					extra = 4;
					lowest = 0x200000;
				}
				else if(c >= 0xf0)
				{
					c &= 0x07;
					extra = 3;
					lowest = 0x10000;
				}
				else if(c >= 0xe0)
				{
					c &= 0x0f;
					extra = 2;
					lowest = 0x800;
				}
				else
				{
					c &= 0x1f;
					extra = 1;
					lowest = 0x80;
				}

				/* Whole sequence not buffered yet: un-read the start byte */
				if(nextin+extra > insize)
				{
					nextin--;
					goto more_bytes;
				}

				s->complicated_utf8_line = 1;
				s->cached_line_char = 0;
				s->cached_line_byte = 0;

				/* k is the 1-based position of the byte in the sequence */
				for(k = 2; k <= extra + 1; k++)
				{
					int trail = inbuf[nextin++];

					if((trail & 0xc0) != 0x80)
					{
						sprintf(s->error_msg,
							"Illegal UTF-8 byte %d <0x%x> at file offset %d",
							k, trail,
							s->bytes_consumed + nextin - 1 - startin);
						c = -1;
						break;
					}
					c = (c << 6) + (trail & 0x3f);
				}

				/* Reject codes that would fit in a shorter sequence */
				if(c != -1 && c < lowest)
				{
					sprintf(s->error_msg,
						"Illegal (non-shortest) UTF-8 sequence for character <0x%x> immediately before file offset %d",
						c, s->bytes_consumed + nextin - startin);
					c = -1;
				}
			}
		}

		if(c >= 0 && !is_xml_legal(c, map))
		{
			sprintf(s->error_msg,
				"Illegal character <0x%x> immediately before file offset %d",
				c, s->bytes_consumed + nextin - startin);
			c = -1;
		}

		ERROR_CHECK;

		LINEFEED;

		OUTPUT_WITH_SURROGATES;

		if(c == '>' && s->read_carefully)
		{
			s->line_is_incomplete = 1;
			goto end_of_line;
		}
	}

	MORE_BYTES;

	END_OF_LINE;
}