static void HtmlParser02()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<a><b/><c></c  ><d at1=\"&lt;quo&amp;ted&gt;\" at2='also quoted'   att3=notquoted att4=&#101;&#x6e;d/></a>");
    utassert(4 == p.ElementsCount());
    utassert(4 == p.TotalAttrCount());
    utassert(root->NameIs("a"));
    utassert(NULL == root->next);
    HtmlElement *el = root->down;
    utassert(el->NameIs("b"));
    utassert(root == el->up);
    el = el->next;
    utassert(el->NameIs("c"));
    utassert(root == el->up);
    el = el->next;
    utassert(el->NameIs("d"));
    utassert(NULL == el->next);
    utassert(root == el->up);
    ScopedMem<WCHAR> val(el->GetAttribute("at1"));
    utassert(str::Eq(val, L"<quo&ted>"));
    val.Set(el->GetAttribute("at2"));
    utassert(str::Eq(val, L"also quoted"));
    val.Set(el->GetAttribute("att3"));
    utassert(str::Eq(val, L"notquoted"));
    val.Set(el->GetAttribute("att4"));
    utassert(str::Eq(val, L"end"));
}
static void HtmlParser05()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<!doctype><html><HEAD><meta name=foo></head><body><object t=la><param name=foo val=bar></object><ul><li></ul></object></body></Html>");
    utassert(8 == p.ElementsCount());
    utassert(4 == p.TotalAttrCount());
    utassert(root->NameIs("html"));
    utassert(NULL == root->up);
    utassert(NULL == root->next);
    HtmlElement *el = root->down;
    utassert(el->NameIs("head"));
    HtmlElement *el2 = el->down;
    utassert(el2->NameIs("meta"));
    utassert(NULL == el2->next);
    utassert(NULL == el2->down);
    el2 = el->next;
    utassert(el2->NameIs("body"));
    utassert(NULL == el2->next);
    el2 = el2->down;
    utassert(el2->NameIs("object"));
    el = p.FindElementByName("html");
    utassert(el);
    el = p.FindElementByName("head", el);
    utassert(el);
    utassert(el->NameIs("head"));
    el = p.FindElementByName("ul", el);
    utassert(el);
}
// Picks the element under which a new tag named tagName should be placed.
// For "li" this is the nearest <ul>/<ol> found by walking from currElement
// up through its ancestors; for everything else (or when no list is found)
// it is currElement itself.
HtmlElement *HtmlParser::FindParent(char *tagName)
{
    if (!str::Eq(tagName, "li"))
        return currElement;

    // make a list item the child of the closest list
    HtmlElement *candidate = currElement;
    while (candidate) {
        if (candidate->NameIs("ul") || candidate->NameIs("ol"))
            return candidate;
        candidate = candidate->up;
    }

    return currElement;
}
static void HtmlParser09()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />");
    utassert(1 == p.ElementsCount());
    utassert(1 == p.TotalAttrCount());
    utassert(root->NameIs("root"));
    ScopedMem<WCHAR> val(root->GetAttribute("attr"));
    utassert(str::Eq(val, L"<!-- comment -->"));

    root = p.Parse("<!-- comment with \" and \' --><main />");
    utassert(1 == p.ElementsCount());
    utassert(0 == p.TotalAttrCount());
    utassert(root->NameIs("main"));
}
static void HtmlParser00()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<a></A>");
    utassert(p.ElementsCount() == 1);
    utassert(root);
    utassert(Tag_A == root->tag && !root->name);
    utassert(root->NameIs("a"));

    root = p.Parse("<b></B>");
    utassert(p.ElementsCount() == 1);
    utassert(root);
    utassert(Tag_B == root->tag && !root->name);
    utassert(root->NameIs("b"));
}
// Searches the element tree depth-first (children before siblings) for an
// element matching name, or matching name within namespace ns when ns is
// given.
//
// name - element name to look for
// ns   - optional namespace; when non-NULL, NameIsNS(name, ns) also counts
// from - optional element to continue after; it is not tested itself and the
//        walk is not limited to its subtree. When NULL, the search starts
//        with (and tests) rootElement.
//
// Returns the first match in traversal order or NULL if there is none.
HtmlElement *HtmlParser::FindElementByNameNS(const char *name, const char *ns, HtmlElement *from)
{
    HtmlElement *cur = from ? from : rootElement;
    if (!cur)
        return NULL;

    // the starting element is only a candidate when no `from` was given
    bool testCurrent = (NULL == from);
    for (;;) {
        if (testCurrent && (cur->NameIs(name) || (ns && cur->NameIsNS(name, ns))))
            return cur;
        testCurrent = true;

        if (cur->down) {
            cur = cur->down;
            continue;
        }
        if (cur->next) {
            cur = cur->next;
            continue;
        }

        // backup in the tree: find the closest ancestor with a next sibling
        HtmlElement *ancestor = cur->up;
        while (ancestor && !ancestor->next)
            ancestor = ancestor->up;
        if (!ancestor)
            return NULL;
        cur = ancestor->next;
    }
}
static void HtmlParser01()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<A><bAh></a>");
    utassert(p.ElementsCount() == 2);
    utassert(Tag_A == root->tag && !root->name);
    utassert(NULL == root->up);
    utassert(NULL == root->next);
    HtmlElement *el = root->down;
    utassert(NULL == el->firstAttr);
    utassert(el->NameIs("bah") && el->NameIs("BAH"));
    utassert(Tag_NotFound == el->tag && str::Eq("bAh", el->name));
    utassert(el->up == root);
    utassert(NULL == el->down);
    utassert(NULL == el->next);
}
static void HtmlParser10()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x='http://example.org/ns/x'><x:b attr='val'/></x:a>");
    utassert(2 == p.ElementsCount());
    utassert(2 == p.TotalAttrCount());
    utassert(root->NameIs("x:a") && root->NameIsNS("a", "http://example.org/ns/x"));

    HtmlElement *node = p.FindElementByName("b");
    utassert(!node);
    node = p.FindElementByNameNS("b", "http://example.org/ns/x");
    utassert(node);
    utassert(node->NameIs("x:b") && node->NameIsNS("b", "http://example.org/ns/x"));
    ScopedMem<WCHAR> val(node->GetAttribute("attr"));
    utassert(str::Eq(val, L"val"));
    // TODO: XML tags are case sensitive (HTML tags aren't)
    node = p.FindElementByName("X:B");
    utassert(node && node->NameIs("X:B"));
}
// Returns the direct child whose name matches `name`, skipping the first
// `idx` such children (idx 0 is the first match). Returns NULL when there
// are not enough matching children.
HtmlElement *HtmlElement::GetChildByName(const char *name, int idx) const
{
    HtmlElement *child = down;
    while (child) {
        // idx counts down the matches that still have to be skipped
        if (child->NameIs(name) && 0 == idx--)
            return child;
        child = child->next;
    }
    return NULL;
}
static void HtmlParser11()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<root/><!-- comment -->");
    utassert(1 == p.ElementsCount());
    utassert(0 == p.TotalAttrCount());
    utassert(root && root->NameIs("root"));

    root = p.Parse("<root><!---></root>");
    utassert(!root);
}
// Parses the HtmlParseTest00.html fixture in place and verifies element and
// attribute counts, the shape of the top of the tree and the number of <ul>
// elements reachable through repeated FindElementByName() calls.
// Silently does nothing when the fixture file can't be read.
static void HtmlParserFile()
{
    // a string literal must only be bound to a pointer-to-const
    const WCHAR *fileName = L"HtmlParseTest00.html";
    // We assume we're being run from obj-[dbg|rel], so the test
    // files are in ..\src\utils directory relative to exe's dir
    ScopedMem<WCHAR> exePath(GetExePath());
    // NOTE(review): the variable is called exeDir but is produced by
    // GetBaseName() - confirm this yields the intended directory
    const WCHAR *exeDir = path::GetBaseName(exePath);
    ScopedMem<WCHAR> p1(path::Join(exeDir, L"..\\src\\utils"));
    ScopedMem<WCHAR> p2(path::Join(p1, fileName));
    char *d = file::ReadAll(p2, NULL);
    // it's ok if we fail - we assume we were not run from the
    // right location
    if (!d)
        return;
    HtmlParser p;
    // the parser works on d directly, so d is only freed at the very end
    HtmlElement *root = p.ParseInPlace(d);
    utassert(root);
    utassert(709 == p.ElementsCount());
    utassert(955 == p.TotalAttrCount());
    utassert(root->NameIs("html"));
    // walk html -> head -> body -> object -> ul -> li -> object -> param
    HtmlElement *el = root->down;
    utassert(el->NameIs("head"));
    el = el->next;
    utassert(el->NameIs("body"));
    el = el->down;
    utassert(el->NameIs("object"));
    el = el->next;
    utassert(el->NameIs("ul"));
    el = el->down;
    utassert(el->NameIs("li"));
    el = el->down;
    utassert(el->NameIs("object"));
    ScopedMem<WCHAR> val(el->GetAttribute("type"));
    utassert(str::Eq(val, L"text/sitemap"));
    el = el->down;
    utassert(el->NameIs("param"));
    utassert(!el->down);
    utassert(el->next->NameIs("param"));
    el = p.FindElementByName("body");
    utassert(el);
    el = p.FindElementByName("ul", el);
    utassert(el);
    // count every <ul> found from here on by continuing after the last hit
    int count = 0;
    while (el) {
        ++count;
        el = p.FindElementByName("ul", el);
    }
    utassert(18 == count);
    free(d);
}
void HtmlParser::CloseTag(char *tagName)
{
    str::ToLower(tagName);
    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    for (HtmlElement *el = currElement; el; el = el->up) {
        if (el->NameIs(tagName)) {
            currElement = el->up;
            return;
        }
    }
    // ignore the unexpected closing tag
}
static void HtmlParser03()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<el   att  =v&quot;al/>");
    utassert(1 == p.ElementsCount());
    utassert(1 == p.TotalAttrCount());
    utassert(root->NameIs("el"));
    utassert(NULL == root->next);
    utassert(NULL == root->up);
    utassert(NULL == root->down);
    ScopedMem<WCHAR> val(root->GetAttribute("att"));
    utassert(str::Eq(val, L"v\"al"));
    utassert(!root->firstAttr->next);
}
// Handles a closing tag given as a token. Known tags are matched by tag id;
// for Tag_NotFound tokens the name in tok->s is NUL-terminated in place
// (at tok->nLen) and matched by name. The parent of the nearest matching
// open element becomes currElement; an unmatched closing tag is ignored.
void HtmlParser::CloseTag(HtmlToken *tok)
{
    char *tagName = NULL;
    if (Tag_NotFound == tok->tag) {
        // writes the terminator into the token's own buffer
        tagName = (char *)tok->s;
        tagName[tok->nLen] = '\0';
    }

    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    HtmlElement *open = currElement;
    while (open) {
        bool matches;
        if (tagName)
            matches = open->NameIs(tagName);
        else
            matches = tok->tag == open->tag;
        if (matches) {
            currElement = open->up;
            return;
        }
        open = open->up;
    }
    // ignore the unexpected closing tag
}
static void HtmlParser06()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>");
    utassert(9 == p.ElementsCount());
    utassert(0 == p.TotalAttrCount());
    utassert(root->NameIs("ul"));
    utassert(!root->next);
    HtmlElement *el = root->GetChildByTag(Tag_Li);
    utassert(el);
    utassert(el->down->NameIs("br"));
    utassert(el->down->next->NameIs("meta"));
    utassert(!el->down->next->next);
    el = root->GetChildByTag(Tag_Li, 1);
    utassert(el);
    utassert(!el->next);
    el = el->GetChildByTag(Tag_Ol);
    utassert(!el->next);
    utassert(el->down->NameIs("li"));
    utassert(!el->down->down);
}
// Example #16
// 0
// Recursively walks a CHM table of contents or index given as nested
// <ul>/<li> lists and reports every <li> to the visitor.
//
// visitor - passed through to VisitChmIndexItem / VisitChmTocItem
// list    - first <ul> to process; consecutive <ul> siblings are processed too
// cp      - passed through to the visit functions unchanged
// isIndex - true to visit items as index entries, false as ToC entries
// level   - nesting depth of the current list, starting at 1
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(!list->NameIs("ul"));

    // some broken ToCs wrap every <li> into its own <ul>
    while (list && list->NameIs("ul")) {
        for (HtmlElement *item = list->down; item; item = item->next) {
            // ignore unexpected elements
            if (!item->NameIs("li"))
                continue;

            bool valid;
            if (isIndex)
                valid = VisitChmIndexItem(visitor, item, cp, level);
            else
                valid = VisitChmTocItem(visitor, item, cp, level);
            // skip incomplete elements and all their children
            if (!valid)
                continue;

            HtmlElement *sublist = item->GetChildByName("ul");
            if (!sublist) {
                // some broken ToCs have the <ul> follow right *after* a <li>
                HtmlElement *following = item->next;
                if (following && following->NameIs("ul"))
                    sublist = following;
            }
            if (sublist)
                WalkChmTocOrIndex(visitor, sublist, cp, isIndex, level + 1);
        }
        list = list->next;
    }
}