// Walks a parsed ToC/index list starting at the <ul> element |list| and
// reports every usable <li> entry through VisitChmIndexItem (when isIndex
// is set) or VisitChmTocItem (otherwise). |cp| and |level| are forwarded
// unchanged to those helpers; nested lists are walked recursively with
// level + 1. |list| must be a <ul> element (enforced with CrashIf).
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(Tag_Ul != list->tag);

    // some broken ToCs wrap every <li> into its own <ul>, so all directly
    // following <ul> siblings are treated as part of the same list
    while (list && Tag_Ul == list->tag) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (Tag_Li != item->tag)
                continue;

            // an entry without an <object> child counts as invalid
            HtmlElement *obj = item->GetChildByTag(Tag_Object);
            bool ok = false;
            if (obj) {
                if (isIndex)
                    ok = VisitChmIndexItem(visitor, obj, cp, level);
                else
                    ok = VisitChmTocItem(visitor, obj, cp, level);
            }
            // skip incomplete elements and all their children
            if (!ok)
                continue;

            HtmlElement *sub = item->GetChildByTag(Tag_Ul);
            if (!sub) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *following = item->next;
                if (following && Tag_Ul == following->tag)
                    sub = following;
            }
            if (sub)
                WalkChmTocOrIndex(visitor, sub, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}
// Loads the document stored at |path|, parses it as HTML and feeds its
// ToC/index entries to |visitor|. Returns false when |path| is NULL, when
// no data could be read or when parsing fails. When no <ul> element is
// found, the result of WalkBrokenChmTocOrIndex is returned instead.
bool ChmDoc::ParseTocOrIndex(EbookTocVisitor *visitor, const char *path, bool isIndex)
{
    if (!path)
        return false;

    // TODO: is path already UTF-8 encoded - or do we need str::conv::ToUtf8(ToStr(path)) ?
    ScopedMem<unsigned char> htmlData(GetData(path, NULL));
    const char *html = reinterpret_cast<const char *>(htmlData.Get());
    if (!html)
        return false;

    // the document's codepage applies unless a BOM marks the content as
    // UTF-8, in which case the BOM's three bytes are skipped
    UINT cp = codepage;
    if (str::StartsWith(html, UTF8_BOM)) {
        html += 3;
        cp = CP_UTF8;
    }

    // enforce the default codepage, so that pre-encoded text and
    // entities are in the same codepage and VisitChmTocItem yields
    // consistent results
    HtmlParser p;
    HtmlElement *root = p.Parse(html, CP_CHM_DEFAULT);
    if (!root)
        return false;

    // since <body> is optional, also continue without one
    HtmlElement *body = p.FindElementByName("body");
    HtmlElement *firstList = p.FindElementByName("ul", body);
    if (firstList) {
        WalkChmTocOrIndex(visitor, firstList, cp, isIndex);
        return true;
    }
    return WalkBrokenChmTocOrIndex(visitor, p, cp, isIndex);
}
// Walks a parsed ToC/index list starting at the <ul> element |list| and
// hands every <li> entry to VisitChmIndexItem (when isIndex is set) or
// VisitChmTocItem (otherwise), forwarding |cp| and |level| unchanged.
// Nested lists are walked recursively with level + 1. |list| must be a
// <ul> element (enforced with CrashIf).
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(!list->NameIs("ul"));

    // some broken ToCs wrap every <li> into its own <ul>, so all directly
    // following <ul> siblings are treated as part of the same list
    while (list && list->NameIs("ul")) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (!item->NameIs("li"))
                continue;

            bool ok;
            if (isIndex)
                ok = VisitChmIndexItem(visitor, item, cp, level);
            else
                ok = VisitChmTocItem(visitor, item, cp, level);
            // skip incomplete elements and all their children
            if (!ok)
                continue;

            HtmlElement *sub = item->GetChildByName("ul");
            if (!sub) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *following = item->next;
                if (following && following->NameIs("ul"))
                    sub = following;
            }
            if (sub)
                WalkChmTocOrIndex(visitor, sub, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}