static void HtmlEntities() { struct { const char *s; int rune; } entities[] = { { "Ü", 220 }, { "ü", 252 }, { "×", 215 }, { "Æ", 198 }, { "‌", 8204 }, { ":", 58 }, { "耏", 32783 }, { " ", 32 }, { "꼴", 44852 }, { "Ä", 196 }, { "&a3;", -1 }, { "&#xz312;", -1 }, { "&aer;", -1 } }; for (size_t i = 0; i < dimof(entities); i++ ) { const char *s = entities[i].s; int got; const char *entEnd = ResolveHtmlEntity(s + 1, str::Len(s) - 1, got); assert(got == entities[i].rune); assert((-1 == got) == !entEnd); } const char *unchanged[] = { "foo", "", " as;d " }; for (size_t i = 0; i < dimof(unchanged); i++) { const char *s = unchanged[i]; const char *res = ResolveHtmlEntities(s, s + str::Len(s), NULL); assert(res == s); } struct { const char *s; const char *res; } changed[] = { // implementation detail: if there is '&' in the string // we always allocate, even if it isn't a valid entity { "a&12", "a&12" }, { "a&x#30", "a&x#30" }, { " b", " b" }, { " ra", " ra" }, { "<", "<" }, { "a&  to end", "a& to end" }, { "  testä ;ö@Pgo", "\xC2\xA0 test\xC3\xA4 ;\xC3\xB6@Pgo" }, }; for (size_t i = 0; i < dimof(changed); i++) { const char *s = changed[i].s; const char *res = ResolveHtmlEntities(s, s + str::Len(s), NULL); assert(str::Eq(res, changed[i].res)); free((void*)res); } }
// convenience function for the above that always allocates char *ResolveHtmlEntities(const char *s, size_t len) { const char *tmp = ResolveHtmlEntities(s, s + len, NULL); if (tmp == s) return str::DupN(s, len); return (char *)tmp; }
// a text run is a string of consecutive text with uniform style void HtmlFormatter::EmitTextRun(const char* s, const char* end) { currReparseIdx = s - htmlParser->Start(); CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser)); CrashIf(IsSpaceOnly(s, end) && !preFormatted); const char* tmp = ResolveHtmlEntities(s, end, textAllocator); bool resolved = tmp != s; if (resolved) { s = tmp; end = s + str::Len(s); } while (s < end) { // don't update the reparseIdx if s doesn't point into the original source if (!resolved) currReparseIdx = s - htmlParser->Start(); size_t strLen = str::Utf8ToWcharBuf(s, end - s, buf, dimof(buf)); // soft hyphens should not be displayed strLen -= str::RemoveChars(buf, L"\xad"); if (0 == strLen) break; textMeasure->SetFont(CurrFont()); RectF bbox = textMeasure->Measure(buf, strLen); EnsureDx(bbox.Width); if (bbox.Width <= pageDx - currX) { AppendInstr(DrawInstr::Str(s, end - s, bbox, dirRtl)); currX += bbox.Width; break; } size_t lenThatFits = StringLenForWidth(textMeasure, buf, strLen, pageDx - NewLineX()); // try to prevent a break in the middle of a word if (iswalnum(buf[lenThatFits])) { for (size_t len = lenThatFits; len > 0; len--) { if (!iswalnum(buf[len - 1])) { lenThatFits = len; break; } } } textMeasure->SetFont(CurrFont()); bbox = textMeasure->Measure(buf, lenThatFits); CrashIf(bbox.Width > pageDx); // s is UTF-8 and buf is UTF-16, so one // WCHAR doesn't always equal one char // TODO: this usually fails for non-BMP characters (i.e. hardly ever) for (size_t i = lenThatFits; i > 0; i--) { lenThatFits += buf[i - 1] < 0x80 ? 0 : buf[i - 1] < 0x800 ? 1 : 2; } AppendInstr(DrawInstr::Str(s, lenThatFits, bbox, dirRtl)); currX += bbox.Width; s += lenThatFits; } }
// Collects the text content of doc's HTML data: text tokens found outside of
// <head>, <script> and <style> are whitespace-trimmed, entity-resolved and
// joined with single spaces; the closing of a non-inline tag turns the
// trailing space into a line break. The result is converted from UTF-8.
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t len;
    const char *data = doc->GetTextData(&len);

    str::Str<char> text(len / 2);
    HtmlPullParser p(data, len);
    // tags that are currently open, innermost last
    Vec<HtmlTag> tagNesting;

    for (HtmlToken *t = p.Next(); t != NULL && !t->IsError(); t = p.Next()) {
        const bool isVisibleText = t->IsText() &&
                                   !tagNesting.Contains(Tag_Head) &&
                                   !tagNesting.Contains(Tag_Script) &&
                                   !tagNesting.Contains(Tag_Style);
        if (isVisibleText) {
            // trim whitespace (TODO: also normalize within text?)
            while (t->sLen > 0 && str::IsWs(*t->s)) {
                t->s++;
                t->sLen--;
            }
            while (t->sLen > 0 && str::IsWs(t->s[t->sLen - 1])) {
                t->sLen--;
            }
            if (t->sLen > 0) {
                text.AppendAndFree(ResolveHtmlEntities(t->s, t->sLen));
                text.Append(' ');
            }
        }
        else if (t->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(t->tag))
                tagNesting.Append(t->tag);
        }
        else if (t->IsEndTag()) {
            // the end of a non-inline tag replaces the separating space with a line break
            const bool endsLine = !IsInlineTag(t->tag) && text.Size() > 0 && text.Last() == ' ';
            if (endsLine) {
                text.Pop();
                text.Append("\r\n");
            }
            // when the closed tag is somewhere on the stack but not on top,
            // everything above it (tags that were never closed explicitly)
            // is dropped together with the tag itself; an end tag that isn't
            // on the stack at all leaves the stack untouched
            if (tagNesting.Contains(t->tag)) {
                while (tagNesting.Last() != t->tag)
                    tagNesting.Pop();
                tagNesting.Pop();
            }
        }
    }

    return str::conv::FromUtf8(text.Get());
}
void EbookController::OnClickedLink(int pageNo, DrawInstr *link) { ScopedMem<WCHAR> url(str::conv::FromHtmlUtf8(link->str.s, link->str.len)); if (url::IsAbsolute(url)) { EbookTocDest dest(nullptr, url); cb->GotoLink(&dest); return; } if (Doc_Epub == doc.Type() && pages && (size_t)pageNo <= pages->Count()) { // normalize the URL by combining it with the chapter's base path for (int j = pageNo; j > 0; j--) { HtmlPage *p = pages->At(j - 1); // <pagebreak src="..." page_marker /> is usually the second instruction on a page for (size_t k = 0; k < std::min((size_t)2, p->instructions.Count()); k++) { DrawInstr& di = p->instructions.At(k); if (InstrAnchor == di.type && str::StartsWith(di.str.s + di.str.len, "\" page_marker />")) { ScopedMem<char> basePath(str::DupN(di.str.s, di.str.len)); ScopedMem<char> relPath(ResolveHtmlEntities(link->str.s, link->str.len)); ScopedMem<char> absPath(NormalizeURL(relPath, basePath)); url.Set(str::conv::FromUtf8(absPath)); j = 0; // done break; } } } } int idx = ResolvePageAnchor(url); if (-1 == idx && str::FindChar(url, '%')) { url::DecodeInPlace(url); idx = ResolvePageAnchor(url); } if (idx != -1) { EbookTocDest dest(nullptr, idx); cb->GotoLink(&dest); } }