Beispiel #1
0
int UArray_isLegalUTF8(const UArray *self)
{
    uint8_t* sourceStart = self->data;
    uint8_t* sourceEnd   = self->data + self->size * self->itemSize;

    return isLegalUTF8Sequence(sourceStart, sourceEnd);
}
Beispiel #2
0
*/	REBFLG Legal_UTF8_Char(const REBYTE *str, REBCNT len)
/*
**		Returns TRUE if char is legal.
**
***********************************************************************/
{
	return isLegalUTF8Sequence(str, str + len);
}
static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
                                          const UTF8 *sourceEnd) {
  UTF8 b1, b2, b3;

  assert(!isLegalUTF8Sequence(source, sourceEnd));

  /*
   * Unicode 6.3.0, D93b:
   *
   *   Maximal subpart of an ill-formed subsequence: The longest code unit
   *   subsequence starting at an unconvertible offset that is either:
   *   a. the initial subsequence of a well-formed code unit sequence, or
   *   b. a subsequence of length one.
   */

  if (source == sourceEnd)
    return 0;

  /*
   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
   * Byte Sequences.
   */

  b1 = *source;
  ++source;
  if (b1 >= 0xC2 && b1 <= 0xDF) {
    /*
     * First byte is valid, but we know that this code unit sequence is
     * invalid, so the maximal subpart has to end after the first byte.
     */
    return 1;
  }

  if (source == sourceEnd)
    return 1;

  b2 = *source;
  ++source;

  if (b1 == 0xE0) {
    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
  }
  if (b1 >= 0xE1 && b1 <= 0xEC) {
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
  }
  if (b1 == 0xED) {
    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
  }
  if (b1 >= 0xEE && b1 <= 0xEF) {
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
  }
  if (b1 == 0xF0) {
    if (b2 >= 0x90 && b2 <= 0xBF) {
      if (source == sourceEnd)
        return 2;

      b3 = *source;
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    }
    return 1;
  }
  if (b1 >= 0xF1 && b1 <= 0xF3) {
    if (b2 >= 0x80 && b2 <= 0xBF) {
      if (source == sourceEnd)
        return 2;

      b3 = *source;
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    }
    return 1;
  }
  if (b1 == 0xF4) {
    if (b2 >= 0x80 && b2 <= 0x8F) {
      if (source == sourceEnd)
        return 2;

      b3 = *source;
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
    }
    return 1;
  }

  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
  /*
   * There are no valid sequences that start with these bytes.  Maximal subpart
   * is defined to have length 1 in these cases.
   */
  return 1;
}
Beispiel #4
0
//
//  Legal_UTF8_Char: C
// 
// Returns TRUE if char is legal.
//
REBOOL Legal_UTF8_Char(const REBYTE *str, REBCNT len)
{
    return LOGICAL(isLegalUTF8Sequence(str, str + len));
}
Beispiel #5
0
char *text2html(unsigned char *buffer, int buffer_len, int flags, char *fonttag)
{
	unsigned char *saved_buffer = buffer;
	string str;

	if (string_initialize(&str,1024))
	{
		char buf[512];
		int last_color = 0; /* the color of the current line */
		int eval_color = 2; /* recheck the color */
		int initial_color = 1;
		int level = 0;
		int line = 0; /* the type of the line */

		if (flags & TEXT2HTML_BODY_TAG)
		{
			sm_snprintf(buf,sizeof(buf),"<BODY BGCOLOR=\"#%06x\" TEXT=\"#%06x\" LINK=\"#%06x\">",user.config.read_background,user.config.read_text,user.config.read_link);
			string_append(&str,buf);
		}
		string_append(&str,fonttag); /* accepts NULL pointer */

		/* check for >0, otherwise we'll be in an endless loop if buffer_len becomes <0 */
		while (buffer_len > 0)
		{
			if (eval_color)
			{
				int new_level = 0;
				int buffer2_len = buffer_len;
				unsigned char *buffer2 = buffer;
				int new_color = 0;

				/* Determine the citation level. Afterwards, buffer2 will point to the end of the citation symbols. */
				while (buffer2_len)
				{
					unsigned char c = *buffer2;
					if (c == '>')
					{
						new_level++;
						if (new_color == 1) new_color = 2;
						else new_color = 1;
					} else
					{
						if (c != ' ') break;
						if (c == ' ' && !new_level) break;
					}
					buffer2_len--;
					buffer2++;
				}

				if (user.config.read_graphical_quote_bar)
				{
					/* When graphical quote bar is enabled we skip all quotation symbols */
					buffer = buffer2;
					buffer_len = buffer2_len;

					if (level != new_level)
					{
						char *begin_quote_string = "<TABLE BGCOLOR=\"#%06x\" WIDTH=\"100%%\" STYLE=\"border-left: 3px solid #%06x; border-right: 3px solid #%06x;\"><TD><FONT COLOR=\"#%06x\">";
						char *end_quote_string = "</FONT></TD></TABLE>";

						/* If new level is larger */
						for (;level < new_level; level++)
						{
							unsigned int color = level%2?user.config.read_quoted:user.config.read_old_quoted;
							sm_snprintf(buf,sizeof(buf),begin_quote_string,user.config.read_quoted_background,color,color,color);
							string_append(&str,buf);
						}

						/* If new level is lower */
						for (;level > new_level; level--)
							string_append(&str,end_quote_string);
					}
				} else
				{
					if (last_color != new_color)
					{
						char *begin_quote_string = "<FONT COLOR=\"#%x\">";
						char *end_quote_string = "</FONT>";

						if (!initial_color) string_append(&str,end_quote_string);
						if (new_color == 1)
						{
							sm_snprintf(buf,sizeof(buf),begin_quote_string,user.config.read_quoted);
							string_append(&str,buf);
						}
						else if (new_color == 2)
						{
							sm_snprintf(buf,sizeof(buf),begin_quote_string,user.config.read_old_quoted);
							string_append(&str,buf);
						}
						last_color = new_color;
						if (new_color) initial_color = 0;
						else initial_color = 1;
					}
				}
				eval_color = 0;
			}

			if (!mystrnicmp("http:",(char*)buffer,5)) write_uri(&buffer, &buffer_len, &str);
			else if (!mystrnicmp("mailto:",(char*)buffer,7)) write_uri(&buffer, &buffer_len, &str);
			else if (!mystrnicmp("ftp:",(char*)buffer,4)) write_uri(&buffer, &buffer_len, &str);
			else if (!mystrnicmp("https:",(char*)buffer,6)) write_uri(&buffer, &buffer_len, &str);
			else
			{
				unsigned char c;

				c = *buffer;

				if ((c == '@') && (buffer > saved_buffer))
				{
					/* A @ has been encountered, check if this belongs to an email adresse by traversing back
           * within the string */

					unsigned char *buffer2 = buffer - 1;
					unsigned char *buffer3;
					unsigned char c2;
					int buffer2_len = buffer_len + 1;
					char *address;

					while ((c2 = *buffer2) && buffer2 > saved_buffer)
					{
						static const char noaliaschars[] = {
							" ()<>@,;:\\\"[]\n\r"};

						if (strchr(noaliaschars,c2))
						{
							buffer2++;
							break;
						}
						buffer2_len++;
						buffer2--;
					}

					if ((buffer3 = (unsigned char*)parse_addr_spec((char*)buffer2, &address)))
					{
						int email_len;

						/* crop the string to the beginning of the email address */
						string_crop(&str,0,str.len - (buffer - buffer2));

						buffer_len += buffer - buffer2;
						buffer -= buffer - buffer2;
						email_len = buffer3 - buffer;
						buffer_len -= email_len;

						buffer = buffer3;

						sm_snprintf(buf,sizeof(buf),"<A HREF=\"mailto:%s\"%s>",address, user.config.read_link_underlined?"":" STYLE=\"TEXT-DECORATION: none\"");
						string_append(&str,buf);
						write_unicode(address,&str);
						string_append(&str,"</A>");
						free(address);
						continue;
					}
				}

		  	if (user.config.read_smilies)
		  	{
		  		int i;
		  		int smily_used = 0;
			  	/* No look into the smily table, this is slow and needs to be improved */
		  		for (i=0;i<sizeof(smily)/sizeof(struct smily);i++)
		  		{
		  			if (!strncmp(smily[i].ascii,(char*)buffer,strlen(smily[i].ascii)))
		  			{
		  				buffer += strlen(smily[i].ascii);
		  				buffer_len -= strlen(smily[i].ascii);
		  				sm_snprintf(buf,sizeof(buf),"<IMG SRC=\"PROGDIR:Images/%s\" VALIGN=\"middle\" ALT=\"%s\">",smily[i].gfx,smily[i].ascii);
		  				string_append(&str,buf);
		  				smily_used = 1;
		  			}
		  		}
		  		if (smily_used) continue;
		  	}

				if (!strncmp("\n<sb>",(char*)buffer,5))
				{
					if (line) string_append(&str,"<BR></TD><TD WIDTH=\"50%\"><HR></TD></TR></TABLE>");
					line = 1;
					buffer += 5;
					buffer_len -= 5;

					string_append(&str,"<TABLE WIDTH=\"100%\" BORDER=\"0\"><TR><TD VALIGN=\"middle\" WIDTH=\"50%\"><HR></TD><TD>");
					continue;
				}

				if (!strncmp("\n<tsb>",(char*)buffer,6))
				{
					if (line) string_append(&str,"<BR></TD><TD WIDTH=\"50%\"><HR></TD></TR></TABLE>");
					line = 2;
					buffer += 6;
					buffer_len -= 6;

					string_append(&str,"<TABLE WIDTH=\"100%\" BORDER=\"0\"><TR><TD VALIGN=\"middle\" WIDTH=\"50%\"><HR></TD><TD>");
					continue;
				}

				if (c < 128)
				{
					buffer++;
					buffer_len--;
					if (c== '<') string_append(&str,"&lt;");
					else if (c== '>') string_append(&str,"&gt;");
					else if (c== '&') string_append(&str,"&amp;");
					else if (c == 10)
					{
						eval_color = 1;
						string_append(&str,"<BR>\n");
						if (line)
						{
							string_append(&str,"</TD><TD WIDTH=\"50%\"><HR></TD></TR></TABLE>");
							line = 0;
						}
					} else
					{
						if (c == 32) {
							if (*buffer == 32 || (flags & TEXT2HTML_NOWRAP)) string_append(&str,"&nbsp;");
							else string_append(&str," ");
						} else {
						  if (c)
						  {
						  	string_append_part(&str,(char*)&c,1);
						  }
						}
					}
				} else
				{
					unsigned int unicode;
					int len = 0;
					/* check if it really could be a utf8 char */
					if (isLegalUTF8Sequence((utf8*)buffer, (utf8*)(buffer+buffer_len)))
					{
						len = utf8tochar((utf8*)buffer, &unicode, user.config.default_codeset);
					}
					if ((len == 0) || (len > buffer_len))
					{
						/* something wrong with that utf8 sequence */
						unicode = '?';
						len = 1;
					}
					buffer_len -= len;
					buffer += len;

					if (unicode == 0) unicode = '_';
					sm_snprintf(buf,sizeof(buf),"&#%d;",unicode);
					string_append(&str,buf);
				}
			}
		}

		if (fonttag) string_append(&str,"</FONT>");

		if (flags & TEXT2HTML_ENDBODY_TAG) string_append(&str,"</BODY>");

		SM_DEBUGF(20,("%s\n",str.str));

		return str.str;
	}
	return NULL;
}