// caller needs to free() the result WCHAR *DecodeHtmlEntitites(const char *string, UINT codepage) { WCHAR *fixed = str::conv::FromCodePage(string, codepage), *dst = fixed; const WCHAR *src = fixed; while (*src) { if (*src != '&') { *dst++ = *src++; continue; } src++; // numeric entities int unicode; if (str::Parse(src, L"#%d;", &unicode) || str::Parse(src, L"#x%x;", &unicode)) { *dst++ = IntToChar(unicode); src = str::FindChar(src, ';') + 1; continue; } // named entities int rune = -1; const WCHAR *entityEnd = src; while (iswalnum(*entityEnd)) entityEnd++; if (entityEnd != src) { size_t entityLen = entityEnd - src; rune = HtmlEntityNameToRune(src, entityLen); } if (-1 != rune) { *dst++ = IntToChar(rune); src = entityEnd; if (*src == ';') ++src; } else { *dst++ = '&'; } } *dst = '\0'; return fixed; }
// if "&foo;" was the entity, s points at the char // after '&' and len is the maximum lenght of the string // (4 in case of "foo;") // returns a pointer to the first character after the entity const char *ResolveHtmlEntity(const char *s, size_t len, int& rune) { const char *entEnd = str::Parse(s, len, "#%d%?;", &rune); if (entEnd) return entEnd; entEnd = str::Parse(s, len, "#x%x%?;", &rune); if (entEnd) return entEnd; // go to the end of a potential named entity for (entEnd = s; entEnd < s + len && isalnum((unsigned char)*entEnd); entEnd++); if (entEnd != s) { rune = HtmlEntityNameToRune(s, entEnd - s); if (-1 == rune) return NULL; // skip the trailing colon - if there is one if (entEnd < s + len && *entEnd == ';') entEnd++; return entEnd; } rune = -1; return NULL; }