static void Test03() { const char* s = "a < b > c <> d <"; HtmlPullParser parser(s, str::Len(s)); HtmlToken* t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "a ")); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "< b > c ")); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "<> d ")); t = parser.Next(); utassert(t && t->IsError() && HtmlToken::UnclosedTag == t->error); t = parser.Next(); utassert(!t); }
// Consumes the next token from parser. If that token is a text token, its
// content is passed through ResolveHtmlEntities and the result is returned.
// Returns NULL when the parser has no more tokens or when the consumed
// token is anything other than text (the token is consumed either way).
// NOTE(review): the returned string is presumably heap-allocated and owned
// by the caller - confirm against ResolveHtmlEntities.
static char *GetTextContent(HtmlPullParser& parser)
{
    HtmlToken *next = parser.Next();
    if (next && next->IsText())
        return ResolveHtmlEntities(next->s, next->sLen);
    return NULL;
}
// Builds the table of contents from the HTML found at tocReparsePoint.
// Returns NULL if there is no tocReparsePoint, else the root passed to
// AppendTocItem (which stays NULL when no entry was appended).
//
// there doesn't seem to be a standard for Mobi ToCs, so we try to
// determine the author's intentions by looking at commonly used tags:
//  * <a filepos=...> or <a href=...> ... </a> yields one entry; all text
//    tokens seen while the link is open are joined with single spaces
//  * <blockquote>, <ul> and <ol> increase the nesting level of the
//    entries inside them
//  * <mbp:pagebreak> ends the ToC
// Scanning also stops at the first parser error.
DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return NULL;

    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> caption;   // text collected for the link being read
    ScopedMem<WCHAR> target;    // non-NULL while inside an <a> with a target
    int depth = 0;
    int lastId = 0;

    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    HtmlToken *token;
    while ((token = parser.Next()) && !token->IsError()) {
        // text inside an open link extends that link's caption
        if (target && token->IsText()) {
            ScopedMem<WCHAR> piece(str::conv::FromHtmlUtf8(token->s, token->sLen));
            if (!caption)
                caption.Set(piece.StealData());
            else
                caption.Set(str::Join(caption, L" ", piece));
            continue;
        }

        // everything below only cares about tags
        if (!token->IsTag())
            continue;
        if (Tag_Mbp_Pagebreak == token->tag)
            break;

        // opening <a>: remember where it points to (filepos preferred
        // over href); nested links are ignored while one is already open
        if (!target && token->IsStartTag() && Tag_A == token->tag) {
            AttrInfo *attr = token->GetAttrByName("filepos");
            if (!attr)
                attr = token->GetAttrByName("href");
            if (attr)
                target.Set(str::conv::FromHtmlUtf8(attr->val, attr->valLen));
            continue;
        }

        // closing </a>: emit the entry, unless the link had no text at all
        if (target && token->IsEndTag() && Tag_A == token->tag) {
            if (caption) {
                PageDestination *dest = NULL;
                if (IsExternalUrl(target))
                    dest = new SimpleDest2(0, RectD(), target.StealData());
                else
                    dest = GetNamedDest(target);

                EbookTocItem *entry = new EbookTocItem(caption.StealData(), dest);
                entry->id = ++lastId;
                entry->open = depth <= 2;
                AppendTocItem(root, entry, depth);
            }
            target.Set(NULL);
            continue;
        }

        // list-like containers determine the nesting level
        if (Tag_Blockquote == token->tag || Tag_Ul == token->tag || Tag_Ol == token->tag) {
            if (token->IsStartTag())
                depth++;
            else if (token->IsEndTag() && depth > 0)
                depth--;
        }
    }

    return root;
}
static void Test02() { const char* s = "<p>Last paragraph"; HtmlPullParser parser(s, str::Len(s)); HtmlToken* t = parser.Next(); utassert(t && t->IsTag() && t->IsStartTag() && Tag_P == t->tag); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "Last paragraph")); }
// Parse s in place i.e. we assume we can modify it. Must be 0-terminated. // The caller owns the memory for s. HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage) { if (this->html) Reset(); this->html = s; this->codepage = codepage; HtmlPullParser parser(s, strlen(s)); HtmlToken *tok; while ((tok = parser.Next())) { char *tag = (char *)tok->s; if (tok->IsError()) { errorContext = tag; switch (tok->error) { case HtmlToken::UnclosedTag: return ParseError(ErrParsingElementName); case HtmlToken::InvalidTag: return ParseError(ErrParsingClosingElement); default: return ParseError(ErrParsingElement); } } if (!tok->IsTag()) { // ignore text content assert(tok->IsText()); continue; } char *tagEnd = tag + tok->nLen; if (!tok->IsEndTag()) { // note: call tok->NextAttr() before zero-terminating names and values AttrInfo *attr = tok->NextAttr(); *tagEnd = '\0'; StartTag(tag); while (attr) { char *name = (char *)attr->name; char *nameEnd = name + attr->nameLen; char *value = (char *)attr->val; char *valueEnd = value + attr->valLen; attr = tok->NextAttr(); *nameEnd = *valueEnd = '\0'; AppendAttr(name, value); } } if (!tok->IsStartTag() || IsTagSelfClosing(tok->tag)) { *tagEnd = '\0'; CloseTag(tag); } } return rootElement; }
// Extracts the text of doc's HTML data and returns it converted with
// str::conv::FromUtf8.
//  * text tokens are ignored while a <head>, <script> or <style> tag is open
//  * every remaining text run is trimmed, has its HTML entities resolved
//    and is followed by a single space
//  * at the end tag of a non-inline element, that trailing space is
//    replaced with "\r\n"
// Extraction stops at the end of the data or at the first parser error.
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t dataLen;
    const char *html = doc->GetTextData(&dataLen);
    str::Str<char> out(dataLen / 2);

    HtmlPullParser parser(html, dataLen);
    Vec<HtmlTag> openTags;
    for (HtmlToken *tok = parser.Next(); tok != NULL && !tok->IsError(); tok = parser.Next()) {
        const bool isVisibleText = tok->IsText() &&
                                   !openTags.Contains(Tag_Head) &&
                                   !openTags.Contains(Tag_Script) &&
                                   !openTags.Contains(Tag_Style);
        if (isVisibleText) {
            // trim whitespace (TODO: also normalize within text?)
            // note: the token itself is narrowed down to the trimmed range
            for (; tok->sLen > 0 && str::IsWs(*tok->s); --tok->sLen)
                ++tok->s;
            for (; tok->sLen > 0 && str::IsWs(tok->s[tok->sLen - 1]); --tok->sLen)
                ;
            if (tok->sLen > 0) {
                out.AppendAndFree(ResolveHtmlEntities(tok->s, tok->sLen));
                out.Append(' ');
            }
        }
        else if (tok->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(tok->tag))
                openTags.Append(tok->tag);
        }
        else if (tok->IsEndTag()) {
            // leaving a non-inline element turns the separator into a line break
            if (!IsInlineTag(tok->tag) && out.Size() > 0 && out.Last() == ' ') {
                out.Pop();
                out.Append("\r\n");
            }
            // when closing a tag, if the top tag doesn't match but the tag
            // is open further down the stack, we pop everything above it
            // and then the matching tag itself; an end tag which was never
            // opened leaves the stack alone
            if (openTags.Contains(tok->tag)) {
                while (openTags.Last() != tok->tag)
                    openTags.Pop();
                openTags.Pop();
            }
        }
    }

    return str::conv::FromUtf8(out.Get());
}
// Builds the table of contents from the document's <title> elements.
//  * <section> nesting determines the level of an entry
//  * all text tokens between <title> and </title> are joined with single
//    spaces and whitespace-normalized to form the entry's caption
//  * a title without any text produces no entry, but is still counted, so
//    that the running title number used for the entry's id and for its
//    FB2_TOC_ENTRY_MARK destination name stays in step with the document
// Scanning stops at the end of the data or at the first parser error.
// Returns the root passed to AppendTocItem (NULL if nothing was appended).
DocTocItem *Fb2EngineImpl::GetTocTree()
{
    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> caption;
    int titleNo = 0;
    bool insideTitle = false;
    int depth = 0;

    size_t dataLen;
    const char *data = doc->GetTextData(&dataLen);
    HtmlPullParser parser(data, dataLen);
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        // sections only affect the nesting level
        if (tok->IsStartTag() && Tag_Section == tok->tag) {
            depth++;
            continue;
        }
        if (tok->IsEndTag() && Tag_Section == tok->tag && depth > 0) {
            depth--;
            continue;
        }

        // <title>: start collecting text for a new entry
        if (tok->IsStartTag() && Tag_Title == tok->tag) {
            insideTitle = true;
            titleNo++;
            continue;
        }

        // </title>: turn the collected text (if any) into an entry
        if (tok->IsEndTag() && Tag_Title == tok->tag) {
            if (caption)
                str::NormalizeWS(caption);
            if (!str::IsEmpty(caption.Get())) {
                ScopedMem<WCHAR> destName(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", titleNo));
                PageDestination *dest = GetNamedDest(destName);
                EbookTocItem *entry = new EbookTocItem(caption.StealData(), dest);
                entry->id = titleNo;
                entry->open = depth <= 2;
                AppendTocItem(root, entry, depth);
            }
            insideTitle = false;
            continue;
        }

        // text inside a title extends the caption
        if (insideTitle && tok->IsText()) {
            ScopedMem<WCHAR> piece(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (!str::IsEmpty(caption.Get()))
                caption.Set(str::Join(caption, L" ", piece));
            else
                caption.Set(piece.StealData());
        }
    }

    return root;
}