static void HtmlParser02() { HtmlParser p; HtmlElement *root = p.Parse("<a><b/><c></c ><d at1=\"<quo&ted>\" at2='also quoted' att3=notquoted att4=end/></a>"); utassert(4 == p.ElementsCount()); utassert(4 == p.TotalAttrCount()); utassert(root->NameIs("a")); utassert(NULL == root->next); HtmlElement *el = root->down; utassert(el->NameIs("b")); utassert(root == el->up); el = el->next; utassert(el->NameIs("c")); utassert(root == el->up); el = el->next; utassert(el->NameIs("d")); utassert(NULL == el->next); utassert(root == el->up); ScopedMem<WCHAR> val(el->GetAttribute("at1")); utassert(str::Eq(val, L"<quo&ted>")); val.Set(el->GetAttribute("at2")); utassert(str::Eq(val, L"also quoted")); val.Set(el->GetAttribute("att3")); utassert(str::Eq(val, L"notquoted")); val.Set(el->GetAttribute("att4")); utassert(str::Eq(val, L"end")); }
static void HtmlParser05() { HtmlParser p; HtmlElement *root = p.Parse("<!doctype><html><HEAD><meta name=foo></head><body><object t=la><param name=foo val=bar></object><ul><li></ul></object></body></Html>"); utassert(8 == p.ElementsCount()); utassert(4 == p.TotalAttrCount()); utassert(root->NameIs("html")); utassert(NULL == root->up); utassert(NULL == root->next); HtmlElement *el = root->down; utassert(el->NameIs("head")); HtmlElement *el2 = el->down; utassert(el2->NameIs("meta")); utassert(NULL == el2->next); utassert(NULL == el2->down); el2 = el->next; utassert(el2->NameIs("body")); utassert(NULL == el2->next); el2 = el2->down; utassert(el2->NameIs("object")); el = p.FindElementByName("html"); utassert(el); el = p.FindElementByName("head", el); utassert(el); utassert(el->NameIs("head")); el = p.FindElementByName("ul", el); utassert(el); }
// Picks the element that a newly opened tag named tagName gets attached to.
// Returns currElement for every tag except "li"; an "li" is attached to the
// nearest <ul>/<ol> found by walking up from currElement (currElement itself
// included), falling back to currElement when there is none.
HtmlElement *HtmlParser::FindParent(char *tagName)
{
    if (!str::Eq(tagName, "li"))
        return currElement;

    // make a list item the child of the closest list
    HtmlElement *ancestor = currElement;
    while (ancestor) {
        if (ancestor->NameIs("ul") || ancestor->NameIs("ol"))
            return ancestor;
        ancestor = ancestor->up;
    }
    return currElement;
}
static void HtmlParser09() { HtmlParser p; HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />"); utassert(1 == p.ElementsCount()); utassert(1 == p.TotalAttrCount()); utassert(root->NameIs("root")); ScopedMem<WCHAR> val(root->GetAttribute("attr")); utassert(str::Eq(val, L"<!-- comment -->")); root = p.Parse("<!-- comment with \" and \' --><main />"); utassert(1 == p.ElementsCount()); utassert(0 == p.TotalAttrCount()); utassert(root->NameIs("main")); }
static void HtmlParser00() { HtmlParser p; HtmlElement *root = p.Parse("<a></A>"); utassert(p.ElementsCount() == 1); utassert(root); utassert(Tag_A == root->tag && !root->name); utassert(root->NameIs("a")); root = p.Parse("<b></B>"); utassert(p.ElementsCount() == 1); utassert(root); utassert(Tag_B == root->tag && !root->name); utassert(root->NameIs("b")); }
// Walks the element tree depth-first (children before following siblings,
// then the following siblings of ancestors) and returns the first element
// for which NameIs(name) holds or - when ns is non-NULL - NameIsNS(name, ns)
// holds. With from == NULL the walk starts at rootElement and tests it too;
// with from != NULL the walk resumes after from, which is not tested itself.
// Returns NULL when nothing (further) matches.
HtmlElement *HtmlParser::FindElementByNameNS(const char *name, const char *ns, HtmlElement *from)
{
    HtmlElement *node = from ? from : rootElement;
    if (!node)
        return NULL;

    // a resume point was already reported by an earlier call, so skip its test
    bool testCurrent = !from;
    for (;;) {
        if (testCurrent && (node->NameIs(name) || (ns && node->NameIsNS(name, ns))))
            return node;
        testCurrent = true;

        // descend first ...
        if (node->down) {
            node = node->down;
            continue;
        }
        // ... otherwise back up in the tree until something has a next sibling
        while (node && !node->next)
            node = node->up;
        if (!node)
            return NULL;
        node = node->next;
    }
}
static void HtmlParser01() { HtmlParser p; HtmlElement *root = p.Parse("<A><bAh></a>"); utassert(p.ElementsCount() == 2); utassert(Tag_A == root->tag && !root->name); utassert(NULL == root->up); utassert(NULL == root->next); HtmlElement *el = root->down; utassert(NULL == el->firstAttr); utassert(el->NameIs("bah") && el->NameIs("BAH")); utassert(Tag_NotFound == el->tag && str::Eq("bAh", el->name)); utassert(el->up == root); utassert(NULL == el->down); utassert(NULL == el->next); }
static void HtmlParser10() { HtmlParser p; HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x=''><x:b attr='val'/></x:a>"); utassert(2 == p.ElementsCount()); utassert(2 == p.TotalAttrCount()); utassert(root->NameIs("x:a") && root->NameIsNS("a", "")); HtmlElement *node = p.FindElementByName("b"); utassert(!node); node = p.FindElementByNameNS("b", ""); utassert(node); utassert(node->NameIs("x:b") && node->NameIsNS("b", "")); ScopedMem<WCHAR> val(node->GetAttribute("attr")); utassert(str::Eq(val, L"val")); // TODO: XML tags are case sensitive (HTML tags aren't) node = p.FindElementByName("X:B"); utassert(node && node->NameIs("X:B")); }
// Returns the direct child at zero-based position idx among those children
// for which NameIs(name) holds, or NULL when there are not that many.
HtmlElement *HtmlElement::GetChildByName(const char *name, int idx) const
{
    int toSkip = idx;
    HtmlElement *child = down;
    while (child) {
        if (child->NameIs(name)) {
            if (toSkip == 0)
                return child;
            --toSkip;
        }
        child = child->next;
    }
    return NULL;
}
static void HtmlParser11() { HtmlParser p; HtmlElement *root = p.Parse("<root/><!-- comment -->"); utassert(1 == p.ElementsCount()); utassert(0 == p.TotalAttrCount()); utassert(root && root->NameIs("root")); root = p.Parse("<root><!---></root>"); utassert(!root); }
static void HtmlParserFile() { WCHAR *fileName = L"HtmlParseTest00.html"; // We assume we're being run from obj-[dbg|rel], so the test // files are in ..\src\utils directory relative to exe's dir ScopedMem<WCHAR> exePath(GetExePath()); const WCHAR *exeDir = path::GetBaseName(exePath); ScopedMem<WCHAR> p1(path::Join(exeDir, L"..\\src\\utils")); ScopedMem<WCHAR> p2(path::Join(p1, fileName)); char *d = file::ReadAll(p2, NULL); // it's ok if we fail - we assume we were not run from the // right location if (!d) return; HtmlParser p; HtmlElement *root = p.ParseInPlace(d); utassert(root); utassert(709 == p.ElementsCount()); utassert(955 == p.TotalAttrCount()); utassert(root->NameIs("html")); HtmlElement *el = root->down; utassert(el->NameIs("head")); el = el->next; utassert(el->NameIs("body")); el = el->down; utassert(el->NameIs("object")); el = el->next; utassert(el->NameIs("ul")); el = el->down; utassert(el->NameIs("li")); el = el->down; utassert(el->NameIs("object")); ScopedMem<WCHAR> val(el->GetAttribute("type")); utassert(str::Eq(val, L"text/sitemap")); el = el->down; utassert(el->NameIs("param")); utassert(!el->down); utassert(el->next->NameIs("param")); el = p.FindElementByName("body"); utassert(el); el = p.FindElementByName("ul", el); utassert(el); int count = 0; while (el) { ++count; el = p.FindElementByName("ul", el); } utassert(18 == count); free(d); }
void HtmlParser::CloseTag(char *tagName) { str::ToLower(tagName); // to allow for lack of closing tags, e.g. in case like // <a><b><c></a>, we look for the first parent with matching name for (HtmlElement *el = currElement; el; el = el->up) { if (el->NameIs(tagName)) { currElement = el->up; return; } } // ignore the unexpected closing tag }
static void HtmlParser03() { HtmlParser p; HtmlElement *root = p.Parse("<el att =v"al/>"); utassert(1 == p.ElementsCount()); utassert(1 == p.TotalAttrCount()); utassert(root->NameIs("el")); utassert(NULL == root->next); utassert(NULL == root->up); utassert(NULL == root->down); ScopedMem<WCHAR> val(root->GetAttribute("att")); utassert(str::Eq(val, L"v\"al")); utassert(!root->firstAttr->next); }
// Handles a closing-tag token. Tokens with a known tag id are matched by
// id; for Tag_NotFound tokens the name text in tok->s is NUL-terminated in
// place after tok->nLen characters and matched by name. The first match
// found from currElement upwards gets closed, i.e. its parent becomes the
// new currElement; without a match nothing changes.
void HtmlParser::CloseTag(HtmlToken *tok)
{
    char *tagName = NULL;
    if (Tag_NotFound == tok->tag) {
        tagName = (char *)tok->s;
        tagName[tok->nLen] = '\0';
    }

    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    HtmlElement *open = currElement;
    for (; open; open = open->up) {
        bool same = tagName ? open->NameIs(tagName) : tok->tag == open->tag;
        if (same)
            break;
    }

    if (open)
        currElement = open->up;
    // else: ignore the unexpected closing tag
}
static void HtmlParser06() { HtmlParser p; HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>"); utassert(9 == p.ElementsCount()); utassert(0 == p.TotalAttrCount()); utassert(root->NameIs("ul")); utassert(!root->next); HtmlElement *el = root->GetChildByTag(Tag_Li); utassert(el); utassert(el->down->NameIs("br")); utassert(el->down->next->NameIs("meta")); utassert(!el->down->next->next); el = root->GetChildByTag(Tag_Li, 1); utassert(el); utassert(!el->next); el = el->GetChildByTag(Tag_Ol); utassert(!el->next); utassert(el->down->NameIs("li")); utassert(!el->down->down); }
// Recursively walks a CHM table of contents or index that is laid out as
// nested <ul>/<li> elements. Each <li> is handed to VisitChmIndexItem
// (isIndex == true) or VisitChmTocItem together with visitor, cp and the
// current nesting level; when that call reports success, a <ul> belonging
// to the item is walked at level + 1. list must be a <ul> (enforced with
// CrashIf); consecutive <ul> siblings of list are walked at the same level.
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(!list->NameIs("ul"));

    // some broken ToCs wrap every <li> into its own <ul>
    while (list && list->NameIs("ul")) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (!item->NameIs("li"))
                continue;

            bool valid = isIndex ? VisitChmIndexItem(visitor, item, cp, level)
                                 : VisitChmTocItem(visitor, item, cp, level);
            // skip incomplete elements and all their children
            if (!valid)
                continue;

            HtmlElement *nested = item->GetChildByName("ul");
            if (!nested) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *sibling = item->next;
                if (sibling && sibling->NameIs("ul"))
                    nested = sibling;
            }
            if (nested)
                WalkChmTocOrIndex(visitor, nested, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}