// Returns a copy of input in which every "&name;" reference recognized by
// GetEntityChar() has been replaced by the character it returns.
//
// An entity name is the longest run of ASCII letters, digits, '_' and '#'
// following the '&'. The terminating ';' is consumed when present but is not
// required for the lookup. References for which GetEntityChar() returns zero
// are copied to the output verbatim and reported with wxLogTrace().
//
// If input contains no reference at all, input itself is returned.
wxString wxHtmlEntitiesParser::Parse(const wxString& input) const
{
    wxString output;

    const wxString::const_iterator end(input.end());
    wxString::const_iterator c(input.begin());

    // Start of the text that has not been copied to output yet.
    wxString::const_iterator last(c);

    for ( ; c < end; ++c )
    {
        if (*c == wxT('&'))
        {
            // A '&' that is the very last character cannot start an entity.
            // Stop before copying anything: the pending text [last, end),
            // including this '&', is then emitted exactly once by the code
            // after the loop. (Copying [last, c) first and breaking with
            // last unchanged would duplicate that text in the result.)
            if ( c + 1 == end )
                break;

            if ( output.empty() )
                output.reserve(input.length());

            // Flush the plain text preceding the '&'.
            if (c - last > 0)
                output.append(last, c);

            ++c;

            wxString entity;
            const wxString::const_iterator ent_s = c;
            wxChar entity_char;

            // Collect the characters allowed in an entity name.
            for ( ; c != end; ++c )
            {
                wxChar ch = *c;
                if ( !((ch >= wxT('a') && ch <= wxT('z')) ||
                       (ch >= wxT('A') && ch <= wxT('Z')) ||
                       (ch >= wxT('0') && ch <= wxT('9')) ||
                       ch == wxT('_') || ch == wxT('#')) )
                    break;
            }

            entity.append(ent_s, c);

            // Without a terminating ';' step back so that c designates the
            // last character belonging to the reference; the character that
            // ended the name is then processed normally by the outer loop.
            if (c == end || *c != wxT(';'))
                --c;

            last = c+1;

            entity_char = GetEntityChar(entity);
            if (entity_char)
                output << entity_char;
            else
            {
                // Unknown entity: keep the original text, '&' included.
                output.append(ent_s-1, c+1);
                wxLogTrace(wxTRACE_HTML_DEBUG, "Unrecognized HTML entity: '%s'", entity);
            }
        }
    }

    if ( last == input.begin() ) // common case: no entity
        return input;

    // Copy whatever follows the last reference.
    if ( last != end )
        output.append(last, end);

    return output;
}
// Returns a copy of input in which every "&name;" reference recognized by
// GetEntityChar() has been replaced by the character it returns.
//
// An entity name is the longest run of ASCII letters, digits, '_' and '#'
// following the '&'. The terminating ';' is consumed when present but is not
// required for the lookup. References for which GetEntityChar() returns zero
// are copied to the output verbatim and reported with wxLogTrace().
//
// If input contains no reference at all, input itself is returned.
wxString wx28HtmlEntitiesParser::Parse(const wxString& input)
{
    const wxChar *c, *last;
    const wxChar *in_str = input.c_str();
    wxString output;

    // last marks the start of the text that has not been copied to output.
    for (c = in_str, last = in_str; *c != wxT('\0'); c++)
    {
        if (*c == wxT('&'))
        {
            // A '&' that is the very last character cannot start an entity.
            // Stop before copying anything: the pending text starting at
            // last, including this '&', is then emitted exactly once by the
            // code after the loop. (Copying it first and breaking with last
            // unchanged would duplicate that text in the result.)
            if ( c[1] == wxT('\0') )
                break;

            if ( output.empty() )
                output.reserve(input.length());

            // Flush the plain text preceding the '&'.
            if (c - last > 0)
                output.append(last, c - last);

            c++;

            wxString entity;
            const wxChar *ent_s = c;
            wxChar entity_char;

            // Skip over the characters allowed in an entity name.
            for (; (*c >= wxT('a') && *c <= wxT('z')) ||
                   (*c >= wxT('A') && *c <= wxT('Z')) ||
                   (*c >= wxT('0') && *c <= wxT('9')) ||
                   *c == wxT('_') || *c == wxT('#'); c++) {}

            entity.append(ent_s, c - ent_s);

            // Without a terminating ';' step back so that c designates the
            // last character belonging to the reference; the character that
            // ended the name is then processed normally by the outer loop.
            if (*c != wxT(';'))
                c--;

            last = c+1;

            entity_char = GetEntityChar(entity);
            if (entity_char)
                output << entity_char;
            else
            {
                // Unknown entity: keep the original text, '&' included.
                output.append(ent_s-1, c-ent_s+2);
                wxLogTrace(wxTRACE_HTML_DEBUG, wxT("Unrecognized HTML entity: '%s'"), entity.c_str());
            }
        }
    }

    if (last == in_str) // common case: no entity
        return input;

    // Copy whatever follows the last reference.
    if (*last != wxT('\0'))
        output.append(last);

    return output;
}
void Decode(std::wstring& str, int opt) { // (opt <= 0 || opt > 3) : Do nothing. // (opt == 1) : Decode both numeric character references and character entity references. // (opt == 2) : Decode only numeric character references. // (opt == 3) : Decode only character entity references. if (opt >= 1 && opt <= 3) { std::wstring::size_type start = 0; while ((start = str.find(L'&', start)) != std::wstring::npos) { std::wstring::size_type end, pos; if ((end = str.find(L';', start)) == std::wstring::npos) break; pos = start + 1; if (pos == end) // &; - skip { start = end + 1; continue; } else if ((end - pos) > 10) // name (or number) is too long { ++start; continue; } if (str[pos] == L'#') // Numeric character reference { if (opt == 3 || // Decode only character entity references, ++pos == end) // &#; - skip { start = end + 1; continue; } int base; if (str[pos] == L'x' || str[pos] == L'X') { if (++pos == end) // &#x; or &#X; - skip { start = end + 1; continue; } base = 16; } else { base = 10; } std::wstring num(str, pos, end - pos); WCHAR* pch = nullptr; errno = 0; long ch = wcstol(num.c_str(), &pch, base); if (pch == nullptr || *pch != L'\0' || errno == ERANGE || ch <= 0 || ch >= 0xFFFE) // invalid character { start = pos; continue; } str.replace(start, end - start + 1, 1, (WCHAR)ch); ++start; } else // Character entity reference { if (opt == 2) // Decode only numeric character references - skip { start = end + 1; continue; } std::wstring name(str, pos, end - pos); WCHAR ch = GetEntityChar(name); if (ch) { str.replace(start, end - start + 1, 1, ch); } ++start; } } } }