// returns true if prev can't contain curr and should thus be closed static bool AutoCloseOnOpen(HtmlTag curr, HtmlTag prev) { CrashIf(IsInlineTag(curr)); // always start afresh for a new <body> if (Tag_Body == curr) return true; // allow <div>s to be contained within inline tags // (e.g. <i><div>...</div></i> from pg12.mobi) if (Tag_Div == curr) return false; switch (prev) { case Tag_Dd: case Tag_Dt: return Tag_Dd == curr || Tag_Dt == curr; case Tag_H1: case Tag_H2: case Tag_H3: case Tag_H4: case Tag_H5: case Tag_H6: return IsTagH(curr); case Tag_Lh: case Tag_Li: return Tag_Lh == curr || Tag_Li == curr; case Tag_P: return true; // <p> can't contain any block-level elements case Tag_Td: case Tag_Tr: return Tag_Tr == curr; default: return IsInlineTag(prev); } }
// Keeps the stack of open tags (tagNesting) in sync with the tag token t,
// auto-closing (via AutoCloseTags) whatever open tags are implicitly
// terminated by t. Does nothing if keepTagNesting is set, if the tag is
// unknown or self-closing, or if t is an empty-element tag.
void HtmlFormatter::UpdateTagNesting(HtmlToken* t) {
    CrashIf(!t->IsTag());
    if (keepTagNesting || Tag_NotFound == t->tag || t->IsEmptyElementEndTag() || IsTagSelfClosing(t->tag)) {
        return;
    }

    const bool inlineTag = IsInlineTag(t->tag);
    // number of entries (counted from the bottom of the stack) that stay open
    size_t keep = tagNesting.size();

    if (t->IsStartTag()) {
        // inline tags never force other tags to be closed
        if (inlineTag) {
            tagNesting.Push(t->tag);
            return;
        }
        // close all tags that can't contain this new block-level tag
        while (keep > 0 && AutoCloseOnOpen(t->tag, tagNesting.at(keep - 1))) {
            keep--;
        }
        AutoCloseTags(tagNesting.size() - keep);
        tagNesting.Push(t->tag);
        return;
    }

    // look for the open tag which this tag closes; everything above it
    // was contained within it and gets closed as well
    // (for inline tags, the search stops at the next block-level tag)
    while (keep > 0) {
        const HtmlTag open = tagNesting.at(keep - 1);
        if (open == t->tag) {
            break;
        }
        if (inlineTag && !IsInlineTag(open)) {
            break;
        }
        keep--;
    }
    // ignore closing tags without a matching open tag
    if (0 == keep || tagNesting.at(keep - 1) != t->tag) {
        return;
    }

    AutoCloseTags(tagNesting.size() - keep);
    // after auto-closing, the matching tag must be on top of the stack
    CrashIf(!t->IsEndTag() || t->tag != tagNesting.Last());
    tagNesting.Pop();
}
// Spot checks for the tag classification helpers
// IsInlineTag and IsTagSelfClosing.
static void Test01() {
    // tags expected to be classified as inline
    utassert(IsInlineTag(Tag_A));
    utassert(IsInlineTag(Tag_U));
    utassert(IsInlineTag(Tag_Span));
    // <p> is not an inline tag
    utassert(!IsInlineTag(Tag_P));

    // tags expected to be classified as self-closing
    utassert(IsTagSelfClosing(Tag_Area));
    utassert(IsTagSelfClosing(Tag_Link));
    utassert(IsTagSelfClosing(Tag_Param));
    // <p> is not self-closing
    utassert(!IsTagSelfClosing(Tag_P));
}
// Applies the "dir" attribute of the tag token t (if present) to the
// reading direction of both the current style and the formatter itself.
void HtmlFormatter::HandleDirAttr(HtmlToken* t) {
    // reading direction changes are only applied to block elements (for now)
    if (!t->IsStartTag() || IsInlineTag(t->tag)) {
        return;
    }

    AttrInfo* dirAttr = t->GetAttrByName("dir");
    if (!dirAttr) {
        return;
    }

    // right-to-left only if the attribute's value is "RTL" (as judged by ValIs)
    const bool rtl = dirAttr->ValIs("RTL");
    CurrStyle()->dirRtl = rtl;
    dirRtl = rtl;
}
// Extracts the plain text content of doc's HTML data: text tokens are
// whitespace-trimmed, have their HTML entities resolved and are separated
// by single spaces; the end of a non-inline tag turns the trailing space
// into a line break. Text inside <head>, <script> and <style> is skipped.
// Returns the result converted from UTF-8 by str::conv::FromUtf8.
static WCHAR *ExtractHtmlText(EpubDoc *doc) {
    size_t dataLen;
    const char *html = doc->GetTextData(&dataLen);

    str::Str<char> plain(dataLen / 2);
    HtmlPullParser parser(html, dataLen);
    // tags which have been opened but not yet closed
    Vec<HtmlTag> openTags;

    for (HtmlToken *tok = parser.Next(); tok != NULL && !tok->IsError(); tok = parser.Next()) {
        const bool visibleText =
            tok->IsText() &&
            !(openTags.Contains(Tag_Head) || openTags.Contains(Tag_Script) || openTags.Contains(Tag_Style));

        if (visibleText) {
            // trim leading and trailing whitespace
            // (TODO: also normalize within text?)
            while (tok->sLen > 0 && str::IsWs(*tok->s)) {
                ++tok->s;
                --tok->sLen;
            }
            while (tok->sLen > 0 && str::IsWs(tok->s[tok->sLen - 1])) {
                --tok->sLen;
            }
            if (tok->sLen > 0) {
                plain.AppendAndFree(ResolveHtmlEntities(tok->s, tok->sLen));
                plain.Append(' ');
            }
        } else if (tok->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(tok->tag)) {
                openTags.Append(tok->tag);
            }
        } else if (tok->IsEndTag()) {
            // the end of a block-level tag replaces the separating space
            // with a line break
            if (!IsInlineTag(tok->tag) && plain.Size() > 0 && plain.Last() == ' ') {
                plain.Pop();
                plain.Append("\r\n");
            }
            // if the closed tag is open somewhere on the stack, drop all
            // the (unclosed) tags above it and then the tag itself;
            // unmatched closing tags leave the stack untouched
            if (openTags.Contains(tok->tag)) {
                while (openTags.Last() != tok->tag) {
                    openTags.Pop();
                }
                openTags.Pop();
            }
        }
    }

    return str::conv::FromUtf8(plain.Get());
}