// Builds the table of contents by scanning the HTML at tocReparsePoint.
// Returns nullptr when there is no reparse point; otherwise returns the
// root that AppendTocItem has built up (still nullptr if no link yielded
// an item).
DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return nullptr;

    EbookTocItem *tocRoot = nullptr;
    // text and target of the <a> element currently being collected
    ScopedMem<WCHAR> pendingText;
    ScopedMem<WCHAR> pendingLink;
    int depth = 0;
    int lastId = 0;

    // Mobi has no apparent standard for ToCs, so the author's intent is
    // guessed from commonly used tags: <a> elements become entries and
    // <blockquote>/<ul>/<ol> nesting determines their level
    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        // text inside an open link is accumulated into the entry's title
        if (pendingLink && tok->IsText()) {
            ScopedMem<WCHAR> fragment(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (!pendingText)
                pendingText.Set(fragment.StealData());
            else
                pendingText.Set(str::Join(pendingText, L" ", fragment));
            continue;
        }
        if (!tok->IsTag())
            continue;
        // a page break stops the scan
        if (Tag_Mbp_Pagebreak == tok->tag)
            break;

        if (Tag_A == tok->tag) {
            if (!pendingLink && tok->IsStartTag()) {
                // prefer "filepos" and fall back to "href" for the target
                AttrInfo *target = tok->GetAttrByName("filepos");
                if (!target)
                    target = tok->GetAttrByName("href");
                if (target)
                    pendingLink.Set(str::conv::FromHtmlUtf8(target->val, target->valLen));
            }
            else if (pendingLink && tok->IsEndTag()) {
                if (!pendingText) {
                    // a link without any text produces no entry
                    pendingLink.Set(nullptr);
                    continue;
                }
                PageDestination *dest = nullptr;
                if (!IsExternalUrl(pendingLink))
                    dest = GetNamedDest(pendingLink);
                else
                    dest = new SimpleDest2(0, RectD(), pendingLink.StealData());
                EbookTocItem *entry = new EbookTocItem(pendingText.StealData(), dest);
                entry->id = ++lastId;
                entry->open = depth <= 2;
                AppendTocItem(tocRoot, entry, depth);
                pendingLink.Set(nullptr);
            }
            continue;
        }

        const bool isNestingTag = Tag_Blockquote == tok->tag ||
                                  Tag_Ul == tok->tag ||
                                  Tag_Ol == tok->tag;
        if (!isNestingTag)
            continue;
        if (tok->IsStartTag())
            depth++;
        else if (tok->IsEndTag() && depth > 0)
            depth--; // never drop below the top level
    }

    return tocRoot;
}
void EbookController::ExtractPageAnchors() { if (pageAnchorIds || pageAnchorIdxs) { CrashIf(!pageAnchorIds || !pageAnchorIdxs); return; } pageAnchorIds = new WStrVec(); pageAnchorIdxs = new Vec<int>(); ScopedMem<WCHAR> epubPagePath; int fb2TitleCount = 0; size_t len; const char *data = doc.GetHtmlData(len); HtmlPullParser parser(data, len); HtmlToken *tok; while ((tok = parser.Next()) != nullptr && !tok->IsError()) { if (!tok->IsStartTag() && !tok->IsEmptyElementEndTag()) continue; AttrInfo *attr = tok->GetAttrByName("id"); if (!attr && Tag_A == tok->tag && doc.Type() != Doc_Fb2) attr = tok->GetAttrByName("name"); if (attr) { ScopedMem<WCHAR> id(str::conv::FromUtf8(attr->val, attr->valLen)); pageAnchorIds->Append(str::Format(L"%s#%s", epubPagePath ? epubPagePath : L"", id.Get())); pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start())); } // update EPUB page paths and create an anchor per chapter if (Tag_Pagebreak == tok->tag && (attr = tok->GetAttrByName("page_path")) != nullptr && str::StartsWith(attr->val + attr->valLen, "\" page_marker />")) { CrashIf(doc.Type() != Doc_Epub); epubPagePath.Set(str::conv::FromUtf8(attr->val, attr->valLen)); pageAnchorIds->Append(str::Dup(epubPagePath)); pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start())); } // create FB2 title anchors (cf. Fb2Doc::ParseToc) if (Tag_Title == tok->tag && tok->IsStartTag() && Doc_Fb2 == doc.Type()) { ScopedMem<WCHAR> id(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", ++fb2TitleCount)); pageAnchorIds->Append(id.StealData()); pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start())); } } }
static void Test00(const char *s, HtmlToken::TokenType expectedType) { HtmlPullParser parser(s, str::Len(s)); HtmlToken *t = parser.Next(); assert(t->type == expectedType); assert(t->NameIs("p")); assert(Tag_P == t->tag); AttrInfo *a = t->GetAttrByName("a1"); assert(a->NameIs("a1")); assert(a->ValIs(">")); a = t->GetAttrByName("foo"); assert(a->NameIs("foo")); assert(a->ValIs("bar")); a = t->GetAttrByName("nope"); assert(!a); t = parser.Next(); assert(!t); }
// cf. http://www.w3.org/TR/html4/charset.html#h-5.2.2 static UINT ExtractHttpCharset(const char *html, size_t htmlLen) { if (!strstr(html, "charset=")) return 0; HtmlPullParser parser(html, min(htmlLen, 1024)); HtmlToken *tok; while ((tok = parser.Next()) && !tok->IsError()) { if (tok->tag != Tag_Meta) continue; AttrInfo *attr = tok->GetAttrByName("http-equiv"); if (!attr || !attr->ValIs("Content-Type")) continue; attr = tok->GetAttrByName("content"); ScopedMem<char> mimetype, charset; if (!attr || !str::Parse(attr->val, attr->valLen, "%S;%_charset=%S", &mimetype, &charset)) continue; static struct { const char *name; UINT codepage; } codepages[] = { { "ISO-8859-1", 1252 }, { "Latin1", 1252 }, { "CP1252", 1252 }, { "Windows-1252", 1252 }, { "ISO-8859-2", 28592 }, { "Latin2", 28592 }, { "CP1251", 1251 }, { "Windows-1251", 1251 }, { "KOI8-R", 20866 }, { "shift-jis", 932 }, { "x-euc", 932 }, { "euc-kr", 949 }, { "Big5", 950 }, { "GB2312", 936 }, { "UTF-8", CP_UTF8 }, }; for (int i = 0; i < dimof(codepages); i++) { if (str::EqI(charset, codepages[i].name)) return codepages[i].codepage; } break; } return 0; }