// Walks the element tree in document order (children first, then siblings,
// then the siblings of ancestors) and returns the first element for which
// NameIs(name) holds or, when ns is given, NameIsNS(name, ns) holds.
// If from is non-NULL the walk resumes *after* that element (from itself is
// not tested); otherwise it starts at rootElement. Returns NULL when the
// tree is exhausted or there is no root.
HtmlElement *HtmlParser::FindElementByNameNS(const char *name, const char *ns, HtmlElement *from)
{
    HtmlElement *cur = from ? from : rootElement;
    // a resume point has already been handed out, so it must not be re-tested
    bool testCurrent = (NULL == from);
    if (testCurrent && !cur)
        return NULL;

    for (;;) {
        if (testCurrent) {
            if (cur->NameIs(name) || (ns && cur->NameIsNS(name, ns)))
                return cur;
        }
        testCurrent = true;

        // descend first, then move sideways
        if (cur->down) {
            cur = cur->down;
            continue;
        }
        if (cur->next) {
            cur = cur->next;
            continue;
        }

        // dead end: climb until an ancestor has a following sibling
        HtmlElement *ancestor = cur->up;
        while (ancestor && !ancestor->next)
            ancestor = ancestor->up;
        if (!ancestor)
            return NULL;
        cur = ancestor->next;
    }
}
static void HtmlParser02() { HtmlParser p; HtmlElement *root = p.Parse("<a><b/><c></c ><d at1=\"<quo&ted>\" at2='also quoted' att3=notquoted att4=end/></a>"); assert(4 == p.ElementsCount()); assert(4 == p.TotalAttrCount()); assert(str::Eq("a", root->name)); assert(NULL == root->next); HtmlElement *el = root->down; assert(str::Eq("b", el->name)); assert(root == el->up); el = el->next; assert(str::Eq("c", el->name)); assert(root == el->up); el = el->next; assert(str::Eq("d", el->name)); assert(NULL == el->next); assert(root == el->up); ScopedMem<TCHAR> val(el->GetAttribute("at1")); assert(str::Eq(val, _T("<quo&ted>"))); val.Set(el->GetAttribute("at2")); assert(str::Eq(val, _T("also quoted"))); val.Set(el->GetAttribute("att3")); assert(str::Eq(val, _T("notquoted"))); val.Set(el->GetAttribute("att4")); assert(str::Eq(val, _T("end"))); }
static void HtmlParser05() { HtmlParser p; HtmlElement *root = p.Parse("<!doctype><html><HEAD><meta name=foo></head><body><object t=la><param name=foo val=bar></object><ul><li></ul></object></body></Html>"); utassert(8 == p.ElementsCount()); utassert(4 == p.TotalAttrCount()); utassert(root->NameIs("html")); utassert(NULL == root->up); utassert(NULL == root->next); HtmlElement *el = root->down; utassert(el->NameIs("head")); HtmlElement *el2 = el->down; utassert(el2->NameIs("meta")); utassert(NULL == el2->next); utassert(NULL == el2->down); el2 = el->next; utassert(el2->NameIs("body")); utassert(NULL == el2->next); el2 = el2->down; utassert(el2->NameIs("object")); el = p.FindElementByName("html"); utassert(el); el = p.FindElementByName("head", el); utassert(el); utassert(el->NameIs("head")); el = p.FindElementByName("ul", el); utassert(el); }
// Visits every <li> of a <ul> list (and of directly following <ul> siblings),
// handing the <li>'s <object> child to VisitChmIndexItem or VisitChmTocItem
// depending on isIndex, and recursing into nested lists with level + 1.
// Items without an <object> child, or whose visit returns false, are skipped
// together with their nested lists.
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(Tag_Ul != list->tag);

    // some broken ToCs wrap every <li> into its own <ul>
    while (list && Tag_Ul == list->tag) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (Tag_Li != item->tag)
                continue;

            HtmlElement *itemObj = item->GetChildByTag(Tag_Object);
            bool accepted = false;
            if (itemObj) {
                if (isIndex)
                    accepted = VisitChmIndexItem(visitor, itemObj, cp, level);
                else
                    accepted = VisitChmTocItem(visitor, itemObj, cp, level);
            }
            // skip incomplete elements and all their children
            if (!accepted)
                continue;

            HtmlElement *subList = item->GetChildByTag(Tag_Ul);
            if (!subList) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *following = item->next;
                if (following && Tag_Ul == following->tag)
                    subList = following;
            }
            if (subList)
                WalkChmTocOrIndex(visitor, subList, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}
// Checks decoding of character references inside unquoted attribute values
// and case-insensitive attribute lookup ("Zero" is queried as "zerO").
//
// The input is kept pure ASCII on purpose: the named/numeric references
// (&auml;, &#xFC;, &#1;, &#0;, &#-1;) must reach the parser undecoded,
// otherwise the result depends on the encoding of this source file and the
// assertions below no longer describe the input. Only the middle umlaut is
// passed as raw UTF-8 bytes (\xC3\xB6) to exercise the CP_UTF8 code page.
static void HtmlParser07()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<test umls=&auml;\xC3\xB6&#xFC; Zero=&#1;&#0;&#-1;>", CP_UTF8);
    utassert(1 == p.ElementsCount());

    // entity, raw UTF-8 and hex reference all yield the matching wide char
    ScopedMem<WCHAR> val(root->GetAttribute("umls"));
    utassert(str::Eq(val, L"\xE4\xF6\xFC"));

    // &#1; is kept as-is; the assertion expects '?' for &#0; and &#-1;
    val.Set(root->GetAttribute("zerO"));
    utassert(str::Eq(val, L"\x01??"));
}
// Returns the idx-th (0-based) direct child for which NameIs(name) holds,
// or NULL if there are not that many matching children.
HtmlElement *HtmlElement::GetChildByName(const char *name, int idx) const
{
    HtmlElement *child = down;
    while (child) {
        // idx counts down once per matching child; the one seen at 0 wins
        if (child->NameIs(name) && 0 == idx--)
            return child;
        child = child->next;
    }
    return NULL;
}
static void HtmlParser11() { HtmlParser p; HtmlElement *root = p.Parse("<root/><!-- comment -->"); utassert(1 == p.ElementsCount()); utassert(0 == p.TotalAttrCount()); utassert(root && root->NameIs("root")); root = p.Parse("<root><!---></root>"); utassert(!root); }
// Picks the element a new tagName element should be attached to.
// For "li" this is the closest enclosing <ul>/<ol> (starting at currElement);
// for everything else, and for an "li" outside any list, it is currElement.
HtmlElement *HtmlParser::FindParent(char *tagName)
{
    if (!str::Eq(tagName, "li"))
        return currElement;

    // make a list item the child of the closest list
    HtmlElement *candidate = currElement;
    while (candidate) {
        if (candidate->NameIs("ul") || candidate->NameIs("ol"))
            return candidate;
        candidate = candidate->up;
    }
    return currElement;
}
void HtmlParser::CloseTag(char *tagName) { str::ToLower(tagName); // to allow for lack of closing tags, e.g. in case like // <a><b><c></a>, we look for the first parent with matching name for (HtmlElement *el = currElement; el; el = el->up) { if (el->NameIs(tagName)) { currElement = el->up; return; } } // ignore the unexpected closing tag }
static void HtmlParser03() { HtmlParser p; HtmlElement *root = p.Parse("<el att =v"al/>"); assert(1 == p.ElementsCount()); assert(1 == p.TotalAttrCount()); assert(str::Eq("el", root->name)); assert(NULL == root->next); assert(NULL == root->up); assert(NULL == root->down); ScopedMem<TCHAR> val(root->GetAttribute("att")); assert(str::Eq(val, _T("v\"al"))); assert(!root->firstAttr->next); }
static void HtmlParser04() { HtmlParser p; HtmlElement *root = p.Parse("<el att= va'l></ el >"); utassert(1 == p.ElementsCount()); utassert(1 == p.TotalAttrCount()); utassert(root->NameIs("el")); utassert(NULL == root->next); utassert(NULL == root->up); utassert(NULL == root->down); ScopedMem<WCHAR> val(root->GetAttribute("att")); utassert(str::Eq(val, L"va'l")); utassert(!root->firstAttr->next); }
static void HtmlParserFile() { TCHAR *fileName = _T("HtmlParseTest00.html"); // We assume we're being run from obj-[dbg|rel], so the test // files are in ..\src\utils directory relative to exe's dir ScopedMem<TCHAR> exePath(GetExePath()); const TCHAR *exeDir = path::GetBaseName(exePath); ScopedMem<TCHAR> p1(path::Join(exeDir, _T("..\\src\\utils"))); ScopedMem<TCHAR> p2(path::Join(p1, fileName)); char *d = file::ReadAll(p2, NULL); // it's ok if we fail - we assume we were not run from the // right location if (!d) return; HtmlParser p; HtmlElement *root = p.ParseInPlace(d); assert(root); assert(709 == p.ElementsCount()); assert(955 == p.TotalAttrCount()); assert(str::Eq(root->name, "html")); HtmlElement *el = root->down; assert(str::Eq(el->name, "head")); el = el->next; assert(str::Eq(el->name, "body")); el = el->down; assert(str::Eq(el->name, "object")); el = el->next; assert(str::Eq(el->name, "ul")); el = el->down; assert(str::Eq(el->name, "li")); el = el->down; assert(str::Eq(el->name, "object")); ScopedMem<TCHAR> val(el->GetAttribute("type")); assert(str::Eq(val, _T("text/sitemap"))); el = el->down; assert(str::Eq(el->name, "param")); assert(!el->down); assert(str::Eq(el->next->name, "param")); el = p.FindElementByName("body"); assert(el); el = p.FindElementByName("ul", el); assert(el); int count = 0; while (el) { ++count; el = p.FindElementByName("ul", el); } assert(18 == count); free(d); }
static void HtmlParser07() { HtmlParser p; HtmlElement *root = p.Parse("<test umls=ä\xC3\xB6ü zero=�&#-1;>", CP_UTF8); assert(1 == p.ElementsCount()); ScopedMem<TCHAR> val(root->GetAttribute("umls")); #ifdef UNICODE assert(str::Eq(val, L"\xE4\xF6\xFC")); #else assert(str::EndsWith(val, "\xFC")); #endif val.Set(root->GetAttribute("zero")); assert(str::Eq(val, _T("\x01??"))); }
static void HtmlParser00() { HtmlParser p; HtmlElement *root = p.Parse("<a></A>"); utassert(p.ElementsCount() == 1); utassert(root); utassert(Tag_A == root->tag && !root->name); utassert(root->NameIs("a")); root = p.Parse("<b></B>"); utassert(p.ElementsCount() == 1); utassert(root); utassert(Tag_B == root->tag && !root->name); utassert(root->NameIs("b")); }
static void HtmlParser09() { HtmlParser p; HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />"); assert(1 == p.ElementsCount()); assert(1 == p.TotalAttrCount()); assert(str::Eq("root", root->name)); ScopedMem<TCHAR> val(root->GetAttribute("attr")); assert(str::Eq(val, _T("<!-- comment -->"))); root = p.Parse("<!-- comment with \" and \' --><main />"); assert(1 == p.ElementsCount()); assert(0 == p.TotalAttrCount()); assert(str::Eq("main", root->name)); }
// ignores any <ul><li> list structure and just extracts a linear list of <object type="text/sitemap">...</object> static bool WalkBrokenChmTocOrIndex(EbookTocVisitor* visitor, HtmlParser& p, UINT cp, bool isIndex) { bool hadOne = false; HtmlElement* el = p.FindElementByName("body"); while ((el = p.FindElementByName("object", el)) != nullptr) { AutoFreeW type(el->GetAttribute("type")); if (!str::EqI(type, L"text/sitemap")) continue; if (isIndex) hadOne |= VisitChmIndexItem(visitor, el, cp, 1); else hadOne |= VisitChmTocItem(visitor, el, cp, 1); } return hadOne; }
static void HtmlParser01() { HtmlParser p; HtmlElement *root = p.Parse("<A><bAh></a>"); utassert(p.ElementsCount() == 2); utassert(Tag_A == root->tag && !root->name); utassert(NULL == root->up); utassert(NULL == root->next); HtmlElement *el = root->down; utassert(NULL == el->firstAttr); utassert(el->NameIs("bah") && el->NameIs("BAH")); utassert(Tag_NotFound == el->tag && str::Eq("bAh", el->name)); utassert(el->up == root); utassert(NULL == el->down); utassert(NULL == el->next); }
// Handles a closing-tag token: pops currElement up to the parent of the
// nearest open element matching the token. Known tags are matched by tag id;
// for Tag_NotFound tokens the name is matched instead, after NUL-terminating
// it inside the token's buffer. Unmatched closing tags are ignored.
void HtmlParser::CloseTag(HtmlToken *tok)
{
    char *tagName = NULL;
    if (Tag_NotFound == tok->tag) {
        // terminate the name in place so it can be used as a C string
        tagName = (char *)tok->s;
        tagName[tok->nLen] = '\0';
    }

    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    HtmlElement *open = currElement;
    while (open) {
        bool isMatch;
        if (tagName)
            isMatch = open->NameIs(tagName);
        else
            isMatch = (tok->tag == open->tag);

        if (isMatch) {
            currElement = open->up;
            return;
        }
        open = open->up;
    }
    // ignore the unexpected closing tag
}
static void HtmlParser06() { HtmlParser p; HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>"); assert(9 == p.ElementsCount()); assert(0 == p.TotalAttrCount()); assert(str::Eq("ul", root->name)); assert(!root->next); HtmlElement *el = root->GetChildByName("li"); assert(el); assert(str::Eq(el->down->name, "br")); assert(str::Eq(el->down->next->name, "meta")); assert(!el->down->next->next); el = root->GetChildByName("li", 1); assert(el); assert(!el->next); el = el->GetChildByName("ol"); assert(!el->next); assert(str::Eq(el->down->name, "li")); assert(!el->down->down); }
bool MobiEngineImpl::Load(const WCHAR *fileName) { this->fileName = str::Dup(fileName); doc = MobiDoc::CreateFromFile(fileName); if (!doc || Pdb_Mobipocket != doc->GetDocType()) return false; HtmlFormatterArgs args; args.htmlStr = doc->GetBookHtmlData(args.htmlStrLen); args.pageDx = (float)pageRect.dx - 2 * pageBorder; args.pageDy = (float)pageRect.dy - 2 * pageBorder; args.fontName = DEFAULT_FONT_NAME; args.fontSize = DEFAULT_FONT_SIZE; args.textAllocator = &allocator; args.measureAlgo = MeasureTextQuick; pages = MobiFormatter(&args, doc).FormatAllPages(); if (!ExtractPageAnchors()) return false; HtmlParser parser; if (parser.Parse(args.htmlStr)) { HtmlElement *ref = NULL; while ((ref = parser.FindElementByName("reference", ref))) { ScopedMem<WCHAR> type(ref->GetAttribute("type")); ScopedMem<WCHAR> filepos(ref->GetAttribute("filepos")); if (str::EqI(type, L"toc") && filepos) { unsigned int pos; if (str::Parse(filepos, L"%u%$", &pos) && pos < args.htmlStrLen) { tocReparsePoint = args.htmlStr + pos; break; } } } } return pages->Count() > 0; }
// Visits every <li> of a <ul> list (and of directly following <ul> siblings),
// handing each <li> to VisitChmIndexItem or VisitChmTocItem depending on
// isIndex, and recursing into nested lists with level + 1.
// Items whose visit returns false are skipped together with their nested lists.
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(!list->NameIs("ul"));

    // some broken ToCs wrap every <li> into its own <ul>
    while (list && list->NameIs("ul")) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (!item->NameIs("li"))
                continue;

            bool accepted;
            if (isIndex)
                accepted = VisitChmIndexItem(visitor, item, cp, level);
            else
                accepted = VisitChmTocItem(visitor, item, cp, level);
            // skip incomplete elements and all their children
            if (!accepted)
                continue;

            HtmlElement *subList = item->GetChildByName("ul");
            if (!subList) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *following = item->next;
                if (following && following->NameIs("ul"))
                    subList = following;
            }
            if (subList)
                WalkChmTocOrIndex(visitor, subList, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}
static void HtmlParser10() { HtmlParser p; HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x='http://example.org/ns/x'><x:b attr='val'/></x:a>"); utassert(2 == p.ElementsCount()); utassert(2 == p.TotalAttrCount()); utassert(root->NameIs("x:a") && root->NameIsNS("a", "http://example.org/ns/x")); HtmlElement *node = p.FindElementByName("b"); utassert(!node); node = p.FindElementByNameNS("b", "http://example.org/ns/x"); utassert(node); utassert(node->NameIs("x:b") && node->NameIsNS("b", "http://example.org/ns/x")); ScopedMem<WCHAR> val(node->GetAttribute("attr")); utassert(str::Eq(val, L"val")); // TODO: XML tags are case sensitive (HTML tags aren't) node = p.FindElementByName("X:B"); utassert(node && node->NameIs("X:B")); }
static void HtmlParser06() { HtmlParser p; HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>"); utassert(9 == p.ElementsCount()); utassert(0 == p.TotalAttrCount()); utassert(root->NameIs("ul")); utassert(!root->next); HtmlElement *el = root->GetChildByTag(Tag_Li); utassert(el); utassert(el->down->NameIs("br")); utassert(el->down->next->NameIs("meta")); utassert(!el->down->next->next); el = root->GetChildByTag(Tag_Li, 1); utassert(el); utassert(!el->next); el = el->GetChildByTag(Tag_Ol); utassert(!el->next); utassert(el->down->NameIs("li")); utassert(!el->down->down); }