/*
 * Build a new Octstr holding the text of `html` with markup removed.
 *
 * The input is scanned one character at a time:
 *   - at '<', the position is handed to skip_html_comment() when
 *     html_comment_begins() reports a comment there, otherwise to
 *     skip_html_tag();
 *   - at '&', convert_html_entity() is given the output string, the input
 *     and the position;
 *   - every other character is appended to the output unchanged.
 *
 * The helpers receive the address of the scan position and are expected to
 * advance it themselves; this function only advances it for plain
 * characters.
 *
 * Before returning, octstr_shrink_blanks() and octstr_strip_blanks() are
 * applied to the result. The returned Octstr is freshly created here;
 * `html` itself is only read by this function.
 */
Octstr *html_to_sms(Octstr *html)
{
    Octstr *text = octstr_create("");
    long total = octstr_len(html);
    long pos = 0;

    while (pos < total) {
        int ch = octstr_get_char(html, pos);

        if (ch == '<') {
            /* Either a comment or an ordinary tag starts here. */
            if (html_comment_begins(html, pos)) {
                skip_html_comment(html, &pos);
            } else {
                skip_html_tag(html, &pos);
            }
        } else if (ch == '&') {
            convert_html_entity(text, html, &pos);
        } else {
            /* Plain character: copy it and step forward by one. */
            octstr_append_char(text, ch);
            ++pos;
        }
    }

    octstr_shrink_blanks(text);
    octstr_strip_blanks(text);
    return text;
}
/*
 * Decode HTML character references from the input range into `buf`.
 *
 * buf    Destination; receives the decoded bytes followed by a terminating
 *        0 byte. No size is passed in, so the caller must supply a buffer
 *        large enough for the decoded text plus the terminator.
 * start  First input byte.
 * end    One past the last input byte. Decoding also stops at the first
 *        0 byte found before `end`. The byte at `end` is never read.
 *
 * Recognised forms (the trailing ';' is consumed when present, but is not
 * required):
 *   &#x<hex digits>   digits are passed to convert_html_entity() with base 16
 *   &#<digits>        digits are passed to convert_html_entity() with base 10
 *   &<alnum name>     name is passed to find_html_entity()
 *
 * When the lookup yields a non-negative value, it is written through
 * ucs4_to_utf8_char(), whose return value becomes the new output position.
 * When the lookup yields a negative value, or '&' is not followed by '#' or
 * a letter, the '&' is copied literally and the bytes after it are then
 * processed as ordinary input.
 *
 * All bounds tests use != rather than <, so the only requirement on `end`
 * is that the scan reaches it exactly (or hits a 0 byte first).
 */
static void parse_html_decode(
    unsigned char *buf,
    const unsigned char *start,
    const unsigned char *end)
{
    const unsigned char *p = start;
    unsigned char *out = buf;

    // Compare against `end` before dereferencing so *end is never touched.
    while (p != end && *p) {
        if (*p != '&') {
            *out++ = *p++;
            continue;
        }

        ++p;
        if (p == end || !*p) {
            // '&' was the last input byte: keep it as-is.
            *out++ = '&';
            continue;
        }

        // q marks the byte right after '&' so a failed lookup can rewind.
        const unsigned char *q = p;
        int e;

        if (*p == '#' && p + 1 != end && p[1] == 'x') {
            p += 2;
            // Digit scans are bounded by `end`; an unbounded scan could step
            // past it and the outer loop would then never see p == end.
            while (p != end && isxdigit(*p))
                ++p;
            e = convert_html_entity(q + 2, p, 16);
        } else if (*p == '#') {
            ++p;
            while (p != end && isdigit(*p))
                ++p;
            e = convert_html_entity(q + 1, p, 10);
        } else if (isalpha(*p)) {
            // entity expansion is not performed...
            while (p != end && isalnum(*p))
                ++p;
            e = find_html_entity(q, p);
        } else {
            // Not a reference at all: emit '&' and reprocess the current byte.
            *out++ = '&';
            continue;
        }

        if (e >= 0) {
            out = ucs4_to_utf8_char(out, e);
            if (p != end && *p == ';')
                ++p;
        } else {
            // Unknown or malformed reference: emit '&' and copy the rest
            // of the reference text verbatim on the following iterations.
            *out++ = '&';
            p = q;
        }
    }
    *out = 0;
}