// Spot-checks the tag classification helpers: A, U and Span must be reported
// as inline tags while P must not; Area, Link and Param must be reported as
// self-closing tags while P must not.
static void Test01() { utassert(IsInlineTag(Tag_A)); utassert(IsInlineTag(Tag_U)); utassert(IsInlineTag(Tag_Span)); utassert(!IsInlineTag(Tag_P)); utassert(IsTagSelfClosing(Tag_Area)); utassert(IsTagSelfClosing(Tag_Link)); utassert(IsTagSelfClosing(Tag_Param)); utassert(!IsTagSelfClosing(Tag_P)); }
// Updates the tagNesting stack for the tag token t, auto-closing (via
// AutoCloseTags) every tag above the point where t fits in.
//
// Nothing happens while keepTagNesting is set, for unknown tags
// (Tag_NotFound), for empty-element tags and for self-closing tags.
//
// - inline start tag: simply pushed.
// - other start tag:  tags on top of the stack for which AutoCloseOnOpen()
//                     returns true are closed first, then t is pushed.
// - end tag:          the stack is searched downwards for a matching tag (an
//                     inline end tag only searches through inline tags);
//                     without a match the end tag is ignored, otherwise
//                     everything above the match is closed and the match
//                     itself is popped.
void HtmlFormatter::UpdateTagNesting(HtmlToken* t) {
    CrashIf(!t->IsTag());
    if (keepTagNesting) {
        return;
    }
    if (Tag_NotFound == t->tag || t->IsEmptyElementEndTag() || IsTagSelfClosing(t->tag)) {
        return;
    }

    // number of stack entries (counted from the bottom) that stay open
    size_t keep = tagNesting.size();
    bool tagIsInline = IsInlineTag(t->tag);

    if (t->IsStartTag()) {
        if (tagIsInline) {
            tagNesting.Push(t->tag);
            return;
        }
        // close all tags that can't contain this new block-level tag
        while (keep > 0 && AutoCloseOnOpen(t->tag, tagNesting.at(keep - 1))) {
            keep--;
        }
    } else {
        // close all tags that were contained within the current tag
        // (for inline tags just up to the next block-level tag)
        while (keep > 0) {
            if (tagIsInline && !IsInlineTag(tagNesting.at(keep - 1))) {
                break;
            }
            if (t->tag == tagNesting.at(keep - 1)) {
                break;
            }
            keep--;
        }
        // no matching open tag: ignore the stray end tag
        if (0 == keep || tagNesting.at(keep - 1) != t->tag) {
            return;
        }
    }

    AutoCloseTags(tagNesting.size() - keep);

    if (!t->IsStartTag()) {
        // the matching start tag must now be on top of the stack
        CrashIf(!t->IsEndTag() || t->tag != tagNesting.Last());
        tagNesting.Pop();
        return;
    }
    tagNesting.Push(t->tag);
}
// Parse s in place i.e. we assume we can modify it. Must be 0-terminated. // The caller owns the memory for s. HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage) { if (this->html) Reset(); this->html = s; this->codepage = codepage; HtmlPullParser parser(s, strlen(s)); HtmlToken *tok; while ((tok = parser.Next())) { char *tag = (char *)tok->s; if (tok->IsError()) { errorContext = tag; switch (tok->error) { case HtmlToken::UnclosedTag: return ParseError(ErrParsingElementName); case HtmlToken::InvalidTag: return ParseError(ErrParsingClosingElement); default: return ParseError(ErrParsingElement); } } if (!tok->IsTag()) { // ignore text content assert(tok->IsText()); continue; } char *tagEnd = tag + tok->nLen; if (!tok->IsEndTag()) { // note: call tok->NextAttr() before zero-terminating names and values AttrInfo *attr = tok->NextAttr(); *tagEnd = '\0'; StartTag(tag); while (attr) { char *name = (char *)attr->name; char *nameEnd = name + attr->nameLen; char *value = (char *)attr->val; char *valueEnd = value + attr->valLen; attr = tok->NextAttr(); *nameEnd = *valueEnd = '\0'; AppendAttr(name, value); } } if (!tok->IsStartTag() || IsTagSelfClosing(tok->tag)) { *tagEnd = '\0'; CloseTag(tag); } } return rootElement; }
static WCHAR *ExtractHtmlText(EpubDoc *doc) { size_t len; const char *data = doc->GetTextData(&len); str::Str<char> text(len / 2); HtmlPullParser p(data, len); HtmlToken *t; Vec<HtmlTag> tagNesting; while ((t = p.Next()) != NULL && !t->IsError()) { if (t->IsText() && !tagNesting.Contains(Tag_Head) && !tagNesting.Contains(Tag_Script) && !tagNesting.Contains(Tag_Style)) { // trim whitespace (TODO: also normalize within text?) while (t->sLen > 0 && str::IsWs(t->s[0])) { t->s++; t->sLen--; } while (t->sLen > 0 && str::IsWs(t->s[t->sLen-1])) t->sLen--; if (t->sLen > 0) { text.AppendAndFree(ResolveHtmlEntities(t->s, t->sLen)); text.Append(' '); } } else if (t->IsStartTag()) { // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen? if (!IsTagSelfClosing(t->tag)) tagNesting.Append(t->tag); } else if (t->IsEndTag()) { if (!IsInlineTag(t->tag) && text.Size() > 0 && text.Last() == ' ') { text.Pop(); text.Append("\r\n"); } // when closing a tag, if the top tag doesn't match but // there are only potentially self-closing tags on the // stack between the matching tag, we pop all of them if (tagNesting.Contains(t->tag)) { while (tagNesting.Last() != t->tag) tagNesting.Pop(); } if (tagNesting.Count() > 0 && tagNesting.Last() == t->tag) tagNesting.Pop(); } } return str::conv::FromUtf8(text.Get()); }
// record the tag for the purpose of building current state // of html tree static void RecordStartTag(Vec<HtmlTag>* tagNesting, HtmlTag tag) { if (!IsTagSelfClosing(tag)) tagNesting->Append(tag); }