/*
 * url_extract:
 * @text: in/out cursor into the text being scanned
 * @full_url: TRUE when the candidate must carry a "scheme:" prefix,
 *   FALSE when it only has to look like a dotted host name
 *
 * Scans forward from *@text over every byte accepted by is_url_char(),
 * then gives back trailing bytes flagged by is_trailing_garbage().
 * The remaining span is accepted when:
 *   - @full_url: it contains a ':' with at least three bytes after it;
 *   - otherwise: it contains two '.' characters, the second one at
 *     least two bytes after the first, each followed by at least two
 *     more bytes.
 *
 * Returns: a copy of the span made with g_strndup(), with *@text moved
 * to the first byte after it; or NULL, with *@text left untouched,
 * when the span does not pass the checks above.
 */
static gchar *
url_extract (const guchar **text,
             gboolean full_url)
{
	const guchar *start = *text;
	const guchar *stop;
	const guchar *mark;
	gchar *url;

	/* Swallow everything that may belong to a URL. */
	for (stop = start; *stop != '\0' && is_url_char (*stop); stop++)
		;

	/* We probably went too far: drop trailing garbage again. */
	for (; stop > start; stop--) {
		if (!is_trailing_garbage (stop[-1]))
			break;
	}

	if (full_url) {
		/* A real URL needs a ':' with something behind it. */
		mark = memchr (start, ':', stop - start);
		if (mark == NULL || stop - mark < 4)
			return NULL;
	} else {
		/* A real host name needs two separated dots. */
		mark = memchr (start, '.', stop - start);
		if (mark == NULL || mark >= stop - 2)
			return NULL;

		mark = memchr (mark + 2, '.', stop - (mark + 2));
		if (mark == NULL || mark >= stop - 2)
			return NULL;
	}

	url = g_strndup ((gchar *) start, stop - start);
	*text = stop;

	return url;
}
// // HTMLエスケープ // // include_url : URL中でもエスケープする( デフォルト = true ) // const std::string MISC::html_escape( const std::string& str, const bool include_url ) { if( str.empty() ) return str; bool is_url = false; int scheme = SCHEME_NONE; std::string str_out; const size_t str_length = str.length(); for( size_t pos = 0; pos < str_length; ++pos ) { char tmpchar = str.c_str()[ pos ]; // URL中はエスケープしない場合 if( ! include_url ) { // URLとして扱うかどうか // エスケープには影響がないので loose_url としておく if( scheme != SCHEME_NONE ) is_url = is_url_char( str.c_str() + pos, true ); // URLスキームが含まれているか判別 int len = 0; if( ! is_url ) scheme = is_url_scheme( str.c_str() + pos, &len ); // URLスキームが含まれていた場合は文字数分進めてループに戻る if( len > 0 ) { str_out += str.substr( pos, len ); pos += len - 1; // あとで ++pos される分を引く continue; } } // include_url = false でURL中ならエスケープしない if( is_url ) str_out += tmpchar; else if( tmpchar == '&' ) { const int bufsize = 64; char out_char[ bufsize ]; int n_in, n_out; const int type = DBTREE::decode_char( str.c_str() + pos, n_in, out_char, n_out, false ); if( type == DBTREE::NODE_NONE ) str_out += "&"; else str_out += tmpchar; } else if( tmpchar == '\"' ) str_out += """; else if( tmpchar == '<' ) str_out += "<"; else if( tmpchar == '>' ) str_out += ">"; else str_out += tmpchar; } #ifdef _DEBUG if( str != str_out ){ std::cout << "MISC::html_escape\nstr = " << str << std::endl << "out = " << str_out << std::endl; } #endif return str_out; }
/**
 * e_text_to_html_full:
 * @input: a NUL-terminated input buffer
 * @flags: some combination of the E_TEXT_TO_HTML_* flags defined
 * in e-html-utils.h
 * @color: color for citation highlighting
 *
 * This takes a buffer of text as input and produces a buffer of
 * "equivalent" HTML, subject to certain transformation rules.
 * The characters '<', '>', '&' and '"' are always replaced by the
 * entities "&lt;", "&gt;", "&amp;" and "&quot;".
 *
 * The set of possible flags is:
 *
 *   - E_TEXT_TO_HTML_PRE: wrap the output HTML in <PRE> and
 *     </PRE>. Should only be used if @input is the entire
 *     buffer to be converted. If e_text_to_html is being called with
 *     small pieces of data, you should wrap the entire result in
 *     <PRE> yourself.
 *
 *   - E_TEXT_TO_HTML_CONVERT_NL: convert "\n" to "<br>\n" on
 *     output. (Should not be used with E_TEXT_TO_HTML_PRE, since
 *     that would result in double-newlines.)
 *
 *   - E_TEXT_TO_HTML_CONVERT_SPACES: convert a block of N spaces
 *     into N-1 non-breaking spaces ("&nbsp;") and one normal space.
 *     A space at the start of the buffer or directly after a newline
 *     is always converted to a non-breaking space, regardless of the
 *     following character, which probably means you don't want to
 *     use this flag on pieces of data that aren't delimited by at
 *     least line breaks.
 *
 *     If either E_TEXT_TO_HTML_CONVERT_NL or
 *     E_TEXT_TO_HTML_CONVERT_SPACES is set, then TABs are also
 *     converted to non-breaking spaces, padding to the next multiple
 *     of eight columns.
 *
 *   - E_TEXT_TO_HTML_CONVERT_URLS: wrap <a href="..."> </a>
 *     around strings that look like URLs.
 *
 *   - E_TEXT_TO_HTML_CONVERT_ADDRESSES: wrap <a href="mailto:...">
 *     </a> around strings that look like mail addresses.
 *
 *   - E_TEXT_TO_HTML_MARK_CITATION: wrap <font color="...">
 *     </font> around citations (lines for which is_citation()
 *     returns TRUE).
 *
 *   - E_TEXT_TO_HTML_ESCAPE_8BIT: flatten everything to US-ASCII;
 *     characters that would otherwise be written as numeric
 *     character references are written as '?'.
 *
 *   - E_TEXT_TO_HTML_CITE: quote the text with "&gt; " (displayed
 *     as "> ") at the start of each line. Ignored when
 *     E_TEXT_TO_HTML_MARK_CITATION is also set.
 *
 * Returns: a newly-allocated string containing HTML
 **/
gchar *
e_text_to_html_full (const gchar *input,
                     guint flags,
                     guint32 color)
{
	const guchar *cur, *next, *linestart;
	gchar *buffer = NULL;
	gchar *out = NULL;
	gint buffer_size = 0, col;
	gboolean colored = FALSE, saw_citation = FALSE;

	/* Allocate a translation buffer. The constant covers "<PRE>"
	 * plus its terminating NUL, so that even an empty @input
	 * leaves enough room for the opening tag. */
	buffer_size = strlen (input) * 2 + 6;
	buffer = g_malloc (buffer_size);

	out = buffer;
	if (flags & E_TEXT_TO_HTML_PRE)
		out += sprintf (out, "<PRE>");

	col = 0;

	for (cur = linestart = (const guchar *) input; cur && *cur; cur = next) {
		gunichar u;

		if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) {
			saw_citation = is_citation (cur, saw_citation);
			if (saw_citation) {
				if (!colored) {
					gchar font[25];

					g_snprintf (font, sizeof (font), "<FONT COLOR=\"#%06x\">", color);

					out = check_size (&buffer, &buffer_size, out, 25);
					out += sprintf (out, "%s", font);
					colored = TRUE;
				}
			} else if (colored) {
				const gchar *no_font = "</FONT>";

				out = check_size (&buffer, &buffer_size, out, 9);
				out += sprintf (out, "%s", no_font);
				colored = FALSE;
			}

			/* Display mbox-mangled ">From" as "From" */
			if (*cur == '>' && !saw_citation) {
				cur++;

				/* Nothing left after the skipped '>': stop
				 * here rather than stepping past the
				 * terminating NUL below. */
				if (!*cur)
					break;
			}
		} else if (flags & E_TEXT_TO_HTML_CITE && col == 0) {
			out = check_size (&buffer, &buffer_size, out, 5);
			out += sprintf (out, "&gt; ");
		}

		u = g_utf8_get_char ((gchar *) cur);
		if (g_unichar_isalpha (u) &&
		    (flags & E_TEXT_TO_HTML_CONVERT_URLS)) {
			gchar *tmpurl = NULL, *refurl = NULL, *dispurl = NULL;

			if (!g_ascii_strncasecmp ((gchar *) cur, "http://", 7) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "https://", 8) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "ftp://", 6) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "nntp://", 7) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "mailto:", 7) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "news:", 5) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "file:", 5) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "callto:", 7) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "h323:", 5) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "sip:", 4) ||
			    !g_ascii_strncasecmp ((gchar *) cur, "webcal:", 7)) {
				/* Explicit scheme: link target and link text
				 * are the same escaped string. */
				tmpurl = url_extract (&cur, TRUE);
				if (tmpurl) {
					refurl = e_text_to_html (tmpurl, 0);
					dispurl = g_strdup (refurl);
				}
			} else if (!g_ascii_strncasecmp ((gchar *) cur, "www.", 4) &&
			           is_url_char (*(cur + 4))) {
				/* Bare "www." host: link to it via http://. */
				tmpurl = url_extract (&cur, FALSE);
				if (tmpurl) {
					dispurl = e_text_to_html (tmpurl, 0);
					refurl = g_strdup_printf ("http://%s", dispurl);
				}
			}

			if (tmpurl) {
				/* 15 == strlen ("<a href=\"\"></a>") */
				out = check_size (&buffer, &buffer_size, out,
						  strlen (refurl) +
						  strlen (dispurl) + 15);
				out += sprintf (out,
						"<a href=\"%s\">%s</a>",
						refurl, dispurl);
				col += strlen (tmpurl);
				g_free (tmpurl);
				g_free (refurl);
				g_free (dispurl);
			}

			if (!*cur)
				break;
			u = g_utf8_get_char ((gchar *) cur);
		}

		if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) {
			gchar *addr, *dispaddr, *outaddr;

			/* NOTE(review): email_address_extract() receives &out,
			 * so it may move the output cursor - presumably to
			 * drop the already-emitted local part; confirm. */
			addr = email_address_extract (&cur, &out, linestart);
			if (addr) {
				dispaddr = e_text_to_html (addr, 0);
				outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>",
							   addr, dispaddr);
				out = check_size (&buffer, &buffer_size, out, strlen (outaddr));
				out += sprintf (out, "%s", outaddr);
				col += strlen (addr);
				g_free (addr);
				g_free (dispaddr);
				g_free (outaddr);

				if (!*cur)
					break;
				u = g_utf8_get_char ((gchar *) cur);
			}
		}

		if (!g_unichar_validate (u)) {
			/* Sigh. Someone sent undeclared 8-bit data.
			 * Assume it's iso-8859-1.
			 */
			u = *cur;
			next = cur + 1;
		} else
			next = (const guchar *) g_utf8_next_char (cur);

		out = check_size (&buffer, &buffer_size, out, 10);

		/* The entity cases advance by sprintf()'s return value so the
		 * step can never disagree with the literal that was written. */
		switch (u) {
		case '<':
			out += sprintf (out, "&lt;");
			col++;
			break;

		case '>':
			out += sprintf (out, "&gt;");
			col++;
			break;

		case '&':
			out += sprintf (out, "&amp;");
			col++;
			break;

		case '"':
			out += sprintf (out, "&quot;");
			col++;
			break;

		case '\n':
			if (flags & E_TEXT_TO_HTML_CONVERT_NL)
				out += sprintf (out, "<br>");
			*out++ = *cur;
			linestart = cur;
			col = 0;
			break;

		case '\t':
			if (flags & (E_TEXT_TO_HTML_CONVERT_SPACES |
				     E_TEXT_TO_HTML_CONVERT_NL)) {
				/* Pad to the next multiple of 8 columns. */
				do {
					out = check_size (&buffer, &buffer_size, out, 7);
					out += sprintf (out, "&nbsp;");
					col++;
				} while (col % 8);
				break;
			}
			/* otherwise, FALL THROUGH */

		case ' ':
			if (flags & E_TEXT_TO_HTML_CONVERT_SPACES) {
				if (cur == (const guchar *) input ||
				    *(cur + 1) == ' ' || *(cur + 1) == '\t' ||
				    *(cur - 1) == '\n') {
					out += sprintf (out, "&nbsp;");
					col++;
					break;
				}
			}
			/* otherwise, FALL THROUGH */

		default:
			if ((u >= 0x20 && u < 0x80) ||
			    (u == '\r' || u == '\t')) {
				/* Default case, just copy. */
				*out++ = u;
			} else {
				if (flags & E_TEXT_TO_HTML_ESCAPE_8BIT)
					*out++ = '?';
				else {
					/* Format into a scratch buffer first:
					 * g_snprintf() returns the untruncated
					 * length, so a too-small limit would
					 * leave a NUL inside the output. */
					gchar entity[16];
					gint len;

					len = g_snprintf (entity, sizeof (entity), "&#%u;", u);
					out = check_size (&buffer, &buffer_size, out, len + 1);
					out += sprintf (out, "%s", entity);
				}
			}
			col++;
			break;
		}
	}

	out = check_size (&buffer, &buffer_size, out, 7);
	if (flags & E_TEXT_TO_HTML_PRE)
		strcpy (out, "</PRE>");
	else
		*out = '\0';

	return buffer;
}