// Reads the next token from parser and, when it is a text node, returns its
// content with HTML entities resolved (caller frees). Returns NULL when the
// parser is exhausted or the next token is not text.
static char *GetTextContent(HtmlPullParser& parser)
{
    HtmlToken *tok = parser.Next();
    if (tok && tok->IsText())
        return ResolveHtmlEntities(tok->s, tok->sLen);
    return NULL;
}
// Builds the table of contents for a Mobi document by re-parsing the HTML
// starting at tocReparsePoint. Returns NULL when no ToC location was found.
// Caller owns the returned tree.
DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return NULL;
    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> itemText;
    ScopedMem<WCHAR> itemLink;
    int itemLevel = 0;
    int idCounter = 0;
    // there doesn't seem to be a standard for Mobi ToCs, so we try to
    // determine the author's intentions by looking at commonly used tags
    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    HtmlToken *tok;
    while ((tok = parser.Next()) && !tok->IsError()) {
        if (itemLink && tok->IsText()) {
            // inside <a …>: accumulate the label (may span several text nodes)
            ScopedMem<WCHAR> linkText(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (itemText)
                itemText.Set(str::Join(itemText, L" ", linkText));
            else
                itemText.Set(linkText.StealData());
        }
        else if (!tok->IsTag())
            continue;
        else if (Tag_Mbp_Pagebreak == tok->tag)
            break; // a page break is taken to end the ToC section
        else if (!itemLink && tok->IsStartTag() && Tag_A == tok->tag) {
            // remember the link target; "filepos" takes precedence over "href"
            AttrInfo *attr = tok->GetAttrByName("filepos");
            if (!attr)
                attr = tok->GetAttrByName("href");
            if (attr)
                itemLink.Set(str::conv::FromHtmlUtf8(attr->val, attr->valLen));
        }
        else if (itemLink && tok->IsEndTag() && Tag_A == tok->tag) {
            // the closing </a> completes one ToC entry
            PageDestination *dest = NULL;
            if (!itemText) {
                // link without visible label: discard it
                itemLink.Set(NULL);
                continue;
            }
            if (IsExternalUrl(itemLink))
                dest = new SimpleDest2(0, RectD(), itemLink.StealData());
            else
                dest = GetNamedDest(itemLink);
            EbookTocItem *item = new EbookTocItem(itemText.StealData(), dest);
            item->id = ++idCounter;
            item->open = itemLevel <= 2; // only expand the topmost levels
            AppendTocItem(root, item, itemLevel);
            itemLink.Set(NULL);
        }
        else if (Tag_Blockquote == tok->tag || Tag_Ul == tok->tag || Tag_Ol == tok->tag) {
            // indentation and list tags are interpreted as nesting changes
            if (tok->IsStartTag())
                itemLevel++;
            else if (tok->IsEndTag() && itemLevel > 0)
                itemLevel--;
        }
    }
    return root;
}
static void Test02() { const char* s = "<p>Last paragraph"; HtmlPullParser parser(s, str::Len(s)); HtmlToken* t = parser.Next(); utassert(t && t->IsTag() && t->IsStartTag() && Tag_P == t->tag); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "Last paragraph")); }
static void Test03() { const char* s = "a < b > c <> d <"; HtmlPullParser parser(s, str::Len(s)); HtmlToken* t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "a ")); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "< b > c ")); t = parser.Next(); utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "<> d ")); t = parser.Next(); utassert(t && t->IsError() && HtmlToken::UnclosedTag == t->error); t = parser.Next(); utassert(!t); }
// the name doesn't quite fit: this handles FB2 tags void Fb2Formatter::HandleHtmlTag(HtmlToken *t) { if (Tag_Title == t->tag || Tag_Subtitle == t->tag) { bool isSubtitle = Tag_Subtitle == t->tag; ScopedMem<char> name(str::Format("h%d", section + (isSubtitle ? 1 : 0))); HtmlToken tok; tok.SetTag(t->type, name, name + str::Len(name)); HandleTagHx(&tok); HandleAnchorAttr(t); if (!isSubtitle && t->IsStartTag()) { char *link = (char *)Allocator::Alloc(textAllocator, 24); sprintf_s(link, 24, FB2_TOC_ENTRY_MARK "%d", ++titleCount); currPage->instructions.Append(DrawInstr::Anchor(link, str::Len(link), RectF(0, currY, pageDx, 0))); } } else if (Tag_Section == t->tag) { if (t->IsStartTag()) section++; else if (t->IsEndTag() && section > 1) section--; FlushCurrLine(true); HandleAnchorAttr(t); } else if (Tag_P == t->tag) { if (!tagNesting.Contains(Tag_Title)) HtmlFormatter::HandleHtmlTag(t); } else if (Tag_Image == t->tag) { HandleTagImg(t); HandleAnchorAttr(t); } else if (Tag_A == t->tag) { HandleTagA(t, "href", "http://www.w3.org/1999/xlink"); HandleAnchorAttr(t, true); } else if (Tag_Pagebreak == t->tag) ForceNewPage(); else if (Tag_Strong == t->tag) HandleTagAsHtml(t, "b"); else if (t->NameIs("emphasis")) HandleTagAsHtml(t, "i"); else if (t->NameIs("epigraph")) HandleTagAsHtml(t, "blockquote"); else if (t->NameIs("empty-line")) { if (!t->IsEndTag()) EmitParagraph(0); } else if (t->NameIs("stylesheet")) HandleTagAsHtml(t, "style"); }
// Extracts the plain text of an EPUB document: strips all tags, skips
// content inside <head>/<script>/<style>, separates inline text with single
// spaces and ends block-level elements with "\r\n". Caller frees the result.
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t len;
    const char *data = doc->GetTextData(&len);
    str::Str<char> text(len / 2);
    HtmlPullParser p(data, len);
    HtmlToken *t;
    Vec<HtmlTag> tagNesting;
    while ((t = p.Next()) != NULL && !t->IsError()) {
        if (t->IsText() && !tagNesting.Contains(Tag_Head) && !tagNesting.Contains(Tag_Script) && !tagNesting.Contains(Tag_Style)) {
            // trim whitespace (TODO: also normalize within text?)
            while (t->sLen > 0 && str::IsWs(t->s[0])) {
                t->s++;
                t->sLen--;
            }
            while (t->sLen > 0 && str::IsWs(t->s[t->sLen-1]))
                t->sLen--;
            if (t->sLen > 0) {
                text.AppendAndFree(ResolveHtmlEntities(t->s, t->sLen));
                text.Append(' ');
            }
        }
        else if (t->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(t->tag))
                tagNesting.Append(t->tag);
        }
        else if (t->IsEndTag()) {
            if (!IsInlineTag(t->tag) && text.Size() > 0 && text.Last() == ' ') {
                // end of a block element: replace the trailing space separator
                // with a line break
                text.Pop();
                text.Append("\r\n");
            }
            // when closing a tag, if the top tag doesn't match but
            // there are only potentially self-closing tags on the
            // stack between the matching tag, we pop all of them
            if (tagNesting.Contains(t->tag)) {
                while (tagNesting.Last() != t->tag)
                    tagNesting.Pop();
            }
            if (tagNesting.Count() > 0 && tagNesting.Last() == t->tag)
                tagNesting.Pop();
        }
    }
    return str::conv::FromUtf8(text.Get());
}
// Shared driver for the parser unit tests: parses s, which is expected to
// contain a single <p> element with attributes a1=">" and foo="bar", and
// checks that the parser reports the expected token type.
// Unlike the previous version, every pointer is verified before it is
// dereferenced, so a parse failure trips an assertion (or returns early)
// instead of crashing the test run on a NULL dereference.
static void Test00(const char *s, HtmlToken::TokenType expectedType)
{
    HtmlPullParser parser(s, str::Len(s));
    HtmlToken *t = parser.Next();
    assert(t);
    if (!t)
        return; // avoid the NULL dereference even when asserts are disabled
    assert(t->type == expectedType);
    assert(t->NameIs("p"));
    assert(Tag_P == t->tag);
    AttrInfo *a = t->GetAttrByName("a1");
    assert(a);
    if (a) {
        assert(a->NameIs("a1"));
        assert(a->ValIs(">"));
    }
    a = t->GetAttrByName("foo");
    assert(a);
    if (a) {
        assert(a->NameIs("foo"));
        assert(a->ValIs("bar"));
    }
    a = t->GetAttrByName("nope");
    assert(!a);
    t = parser.Next();
    assert(!t);
}
// Parse s in place i.e. we assume we can modify it. Must be 0-terminated. // The caller owns the memory for s. HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage) { if (this->html) Reset(); this->html = s; this->codepage = codepage; HtmlPullParser parser(s, strlen(s)); HtmlToken *tok; while ((tok = parser.Next())) { char *tag = (char *)tok->s; if (tok->IsError()) { errorContext = tag; switch (tok->error) { case HtmlToken::UnclosedTag: return ParseError(ErrParsingElementName); case HtmlToken::InvalidTag: return ParseError(ErrParsingClosingElement); default: return ParseError(ErrParsingElement); } } if (!tok->IsTag()) { // ignore text content assert(tok->IsText()); continue; } char *tagEnd = tag + tok->nLen; if (!tok->IsEndTag()) { // note: call tok->NextAttr() before zero-terminating names and values AttrInfo *attr = tok->NextAttr(); *tagEnd = '\0'; StartTag(tag); while (attr) { char *name = (char *)attr->name; char *nameEnd = name + attr->nameLen; char *value = (char *)attr->val; char *valueEnd = value + attr->valLen; attr = tok->NextAttr(); *nameEnd = *valueEnd = '\0'; AppendAttr(name, value); } } if (!tok->IsStartTag() || IsTagSelfClosing(tok->tag)) { *tagEnd = '\0'; CloseTag(tag); } } return rootElement; }
// Return the next parsed page. Returns NULL if finished parsing. // For simplicity of implementation, we parse xml text node or // xml element at a time. This might cause a creation of one // or more pages, which we remeber and send to the caller // if we detect accumulated pages. HtmlPage *HtmlFormatter::Next(bool skipEmptyPages) { for (;;) { // send out all pages accumulated so far while (pagesToSend.Count() > 0) { HtmlPage *ret = pagesToSend.At(0); pagesToSend.RemoveAt(0); pageCount++; if (skipEmptyPages && IsEmptyPage(ret)) delete ret; else return ret; } // we can call ourselves recursively to send outstanding // pages after parsing has finished so this is to detect // that case and really end parsing if (finishedParsing) return NULL; HtmlToken *t = htmlParser->Next(); if (!t || t->IsError()) break; currReparseIdx = t->GetReparsePoint() - htmlParser->Start(); CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser)); if (t->IsTag()) HandleHtmlTag(t); else if (!IgnoreText()) HandleText(t); } // force layout of the last line AutoCloseTags(tagNesting.Count()); FlushCurrLine(true); UpdateLinkBboxes(currPage); pagesToSend.Append(currPage); currPage = NULL; // call ourselves recursively to return accumulated pages finishedParsing = true; return Next(); }
// extract ComicInfo.xml metadata // cf. http://comicrack.cyolito.com/downloads/comicrack/ComicRack/Support-Files/ComicInfoSchema.zip/ void CbxEngineImpl::ParseComicInfoXml(const char *xmlData) { PoolAllocator allocator; HtmlPullParser parser(xmlData, str::Len(xmlData)); HtmlToken *tok; while ((tok = parser.Next()) && !tok->IsError()) { if (!tok->IsStartTag()) continue; if (tok->NameIs("Title")) { ScopedMem<char> value(GetTextContent(parser)); if (value) Visit("/ComicBookInfo/1.0/title", value, json::Type_String); } else if (tok->NameIs("Year")) { ScopedMem<char> value(GetTextContent(parser)); if (value) Visit("/ComicBookInfo/1.0/publicationYear", value, json::Type_Number); } else if (tok->NameIs("Month")) { ScopedMem<char> value(GetTextContent(parser)); if (value) Visit("/ComicBookInfo/1.0/publicationMonth", value, json::Type_Number); } else if (tok->NameIs("Summary")) { ScopedMem<char> value(GetTextContent(parser)); if (value) Visit("/X-summary", value, json::Type_String); } else if (tok->NameIs("Writer")) { ScopedMem<char> value(GetTextContent(parser)); if (value) { Visit("/ComicBookInfo/1.0/credits[0]/person", value, json::Type_String); Visit("/ComicBookInfo/1.0/credits[0]/primary", "true", json::Type_Bool); } } else if (tok->NameIs("Penciller")) { ScopedMem<char> value(GetTextContent(parser)); if (value) { Visit("/ComicBookInfo/1.0/credits[1]/person", value, json::Type_String); Visit("/ComicBookInfo/1.0/credits[1]/primary", "true", json::Type_Bool); } } } }
// cf. http://www.w3.org/TR/html4/charset.html#h-5.2.2 static UINT ExtractHttpCharset(const char *html, size_t htmlLen) { if (!strstr(html, "charset=")) return 0; HtmlPullParser parser(html, min(htmlLen, 1024)); HtmlToken *tok; while ((tok = parser.Next()) && !tok->IsError()) { if (tok->tag != Tag_Meta) continue; AttrInfo *attr = tok->GetAttrByName("http-equiv"); if (!attr || !attr->ValIs("Content-Type")) continue; attr = tok->GetAttrByName("content"); ScopedMem<char> mimetype, charset; if (!attr || !str::Parse(attr->val, attr->valLen, "%S;%_charset=%S", &mimetype, &charset)) continue; static struct { const char *name; UINT codepage; } codepages[] = { { "ISO-8859-1", 1252 }, { "Latin1", 1252 }, { "CP1252", 1252 }, { "Windows-1252", 1252 }, { "ISO-8859-2", 28592 }, { "Latin2", 28592 }, { "CP1251", 1251 }, { "Windows-1251", 1251 }, { "KOI8-R", 20866 }, { "shift-jis", 932 }, { "x-euc", 932 }, { "euc-kr", 949 }, { "Big5", 950 }, { "GB2312", 936 }, { "UTF-8", CP_UTF8 }, }; for (int i = 0; i < dimof(codepages); i++) { if (str::EqI(charset, codepages[i].name)) return codepages[i].codepage; } break; } return 0; }
// Builds the table of contents from the FB2 document's <title> elements,
// nesting entries by the surrounding <section> depth. Caller owns the
// returned tree; returns NULL when no titles were found.
DocTocItem *Fb2EngineImpl::GetTocTree()
{
    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> itemText;
    int titleCount = 0;
    bool inTitle = false;
    int level = 0;
    size_t xmlLen;
    const char *xmlData = doc->GetTextData(&xmlLen);
    HtmlPullParser parser(xmlData, xmlLen);
    HtmlToken *tok;
    while ((tok = parser.Next()) && !tok->IsError()) {
        if (tok->IsStartTag() && Tag_Section == tok->tag)
            level++;
        else if (tok->IsEndTag() && Tag_Section == tok->tag && level > 0)
            level--;
        else if (tok->IsStartTag() && Tag_Title == tok->tag) {
            inTitle = true;
            // titleCount mirrors the numbering used for the layout anchors
            titleCount++;
        }
        else if (tok->IsEndTag() && Tag_Title == tok->tag) {
            // </title> completes one ToC entry
            if (itemText)
                str::NormalizeWS(itemText);
            if (!str::IsEmpty(itemText.Get())) {
                // resolve the synthetic anchor emitted during layout
                // (cf. Fb2Formatter::HandleHtmlTag)
                ScopedMem<WCHAR> name(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", titleCount));
                PageDestination *dest = GetNamedDest(name);
                EbookTocItem *item = new EbookTocItem(itemText.StealData(), dest);
                item->id = titleCount;
                item->open = level <= 2; // only expand the topmost levels
                AppendTocItem(root, item, level);
            }
            inTitle = false;
        }
        else if (inTitle && tok->IsText()) {
            // accumulate all text nodes between <title> and </title>
            ScopedMem<WCHAR> text(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (str::IsEmpty(itemText.Get()))
                itemText.Set(text.StealData());
            else
                itemText.Set(str::Join(itemText, L" ", text));
        }
    }
    return root;
}
// Collects all link-target anchors in the document (element ids, <a name>s,
// EPUB page boundaries and FB2 titles) together with the reparse index where
// each one occurs, so that internal links can later be resolved to layout
// positions. Idempotent: does nothing if the anchors were already extracted.
void EbookController::ExtractPageAnchors()
{
    if (pageAnchorIds || pageAnchorIdxs) {
        // both vectors are built in lockstep; only one existing is a bug
        CrashIf(!pageAnchorIds || !pageAnchorIdxs);
        return;
    }
    pageAnchorIds = new WStrVec();
    pageAnchorIdxs = new Vec<int>();
    ScopedMem<WCHAR> epubPagePath;
    int fb2TitleCount = 0;
    size_t len;
    const char *data = doc.GetHtmlData(len);
    HtmlPullParser parser(data, len);
    HtmlToken *tok;
    while ((tok = parser.Next()) != nullptr && !tok->IsError()) {
        if (!tok->IsStartTag() && !tok->IsEmptyElementEndTag())
            continue;
        AttrInfo *attr = tok->GetAttrByName("id");
        if (!attr && Tag_A == tok->tag && doc.Type() != Doc_Fb2)
            attr = tok->GetAttrByName("name");
        if (attr) {
            // qualify the id with the current EPUB page path (empty otherwise)
            ScopedMem<WCHAR> id(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Format(L"%s#%s", epubPagePath ? epubPagePath : L"", id.Get()));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // update EPUB page paths and create an anchor per chapter
        // (the StartsWith check recognizes the synthetic pagebreak markers
        // inserted when the EPUB's HTML files were concatenated)
        if (Tag_Pagebreak == tok->tag && (attr = tok->GetAttrByName("page_path")) != nullptr && str::StartsWith(attr->val + attr->valLen, "\" page_marker />")) {
            CrashIf(doc.Type() != Doc_Epub);
            epubPagePath.Set(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Dup(epubPagePath));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // create FB2 title anchors (cf. Fb2Doc::ParseToc)
        if (Tag_Title == tok->tag && tok->IsStartTag() && Doc_Fb2 == doc.Type()) {
            ScopedMem<WCHAR> id(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", ++fb2TitleCount));
            pageAnchorIds->Append(id.StealData());
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
    }
}
// Re-dispatches an FB2 token to the base HTML formatter under a different
// (HTML) tag name, preserving the token's type (start/end/empty).
void Fb2Formatter::HandleTagAsHtml(HtmlToken *t, const char *name)
{
    size_t nameLen = str::Len(name);
    HtmlToken aliased;
    aliased.SetTag(t->type, name, name + nameLen);
    HtmlFormatter::HandleHtmlTag(&aliased);
}