HtmlElement *HtmlParser::FindElementByNameNS(const char *name, const char *ns, HtmlElement *from)
{
    HtmlElement *el = from ? from : rootElement;
    if (from)
        goto FindNext;
    if (!el)
        return NULL;
CheckNext:
    if (el->NameIs(name) || ns && el->NameIsNS(name, ns))
        return el;
FindNext:
    if (el->down) {
        el = el->down;
        goto CheckNext;
    }
    if (el->next) {
        el = el->next;
        goto CheckNext;
    }
    // backup in the tree
    HtmlElement *parent = el->up;
    while (parent) {
        if (parent->next) {
            el = parent->next;
            goto CheckNext;
        }
        parent = parent->up;
    }
    return NULL;
}
Ejemplo n.º 2
0
static void HtmlParser02()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<a><b/><c></c  ><d at1=\"&lt;quo&amp;ted&gt;\" at2='also quoted'   att3=notquoted att4=&#101;&#x6e;d/></a>");
    assert(4 == p.ElementsCount());
    assert(4 == p.TotalAttrCount());
    assert(str::Eq("a", root->name));
    assert(NULL == root->next);
    HtmlElement *el = root->down;
    assert(str::Eq("b", el->name));
    assert(root == el->up);
    el = el->next;
    assert(str::Eq("c", el->name));
    assert(root == el->up);
    el = el->next;
    assert(str::Eq("d", el->name));
    assert(NULL == el->next);
    assert(root == el->up);
    ScopedMem<TCHAR> val(el->GetAttribute("at1"));
    assert(str::Eq(val, _T("<quo&ted>")));
    val.Set(el->GetAttribute("at2"));
    assert(str::Eq(val, _T("also quoted")));
    val.Set(el->GetAttribute("att3"));
    assert(str::Eq(val, _T("notquoted")));
    val.Set(el->GetAttribute("att4"));
    assert(str::Eq(val, _T("end")));
}
static void HtmlParser05()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<!doctype><html><HEAD><meta name=foo></head><body><object t=la><param name=foo val=bar></object><ul><li></ul></object></body></Html>");
    utassert(8 == p.ElementsCount());
    utassert(4 == p.TotalAttrCount());
    utassert(root->NameIs("html"));
    utassert(NULL == root->up);
    utassert(NULL == root->next);
    HtmlElement *el = root->down;
    utassert(el->NameIs("head"));
    HtmlElement *el2 = el->down;
    utassert(el2->NameIs("meta"));
    utassert(NULL == el2->next);
    utassert(NULL == el2->down);
    el2 = el->next;
    utassert(el2->NameIs("body"));
    utassert(NULL == el2->next);
    el2 = el2->down;
    utassert(el2->NameIs("object"));
    el = p.FindElementByName("html");
    utassert(el);
    el = p.FindElementByName("head", el);
    utassert(el);
    utassert(el->NameIs("head"));
    el = p.FindElementByName("ul", el);
    utassert(el);
}
Ejemplo n.º 4
0
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(Tag_Ul != list->tag);

    // some broken ToCs wrap every <li> into its own <ul>
    for (; list && Tag_Ul == list->tag; list = list->next) {
        for (HtmlElement *el = list->down; el; el = el->next) {
            if (Tag_Li != el->tag)
                continue; // ignore unexpected elements

            bool valid;
            HtmlElement *elObj = el->GetChildByTag(Tag_Object);
            if (!elObj)
                valid = false;
            else if (isIndex)
                valid = VisitChmIndexItem(visitor, elObj, cp, level);
            else
                valid = VisitChmTocItem(visitor, elObj, cp, level);
            if (!valid)
                continue; // skip incomplete elements and all their children

            HtmlElement *nested = el->GetChildByTag(Tag_Ul);
            // some broken ToCs have the <ul> follow right *after* a <li>
            if (!nested && el->next && Tag_Ul == el->next->tag)
                nested = el->next;
            if (nested)
                WalkChmTocOrIndex(visitor, nested, cp, isIndex, level + 1);
        }
    }
}
static void HtmlParser07()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<test umls=&auml;\xC3\xB6&#xFC; Zero=&#1;&#0;&#-1;>", CP_UTF8);
    utassert(1 == p.ElementsCount());
    ScopedMem<WCHAR> val(root->GetAttribute("umls"));
    utassert(str::Eq(val, L"\xE4\xF6\xFC"));
    val.Set(root->GetAttribute("zerO"));
    utassert(str::Eq(val, L"\x01??"));
}
HtmlElement *HtmlElement::GetChildByName(const char *name, int idx) const
{
    for (HtmlElement *el = down; el; el = el->next) {
        if (el->NameIs(name)) {
            if (0 == idx)
                return el;
            idx--;
        }
    }
    return NULL;
}
static void HtmlParser11()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<root/><!-- comment -->");
    utassert(1 == p.ElementsCount());
    utassert(0 == p.TotalAttrCount());
    utassert(root && root->NameIs("root"));

    root = p.Parse("<root><!---></root>");
    utassert(!root);
}
HtmlElement *HtmlParser::FindParent(char *tagName)
{
    if (str::Eq(tagName, "li")) {
        // make a list item the child of the closest list
        for (HtmlElement *el = currElement; el; el = el->up) {
            if (el->NameIs("ul") || el->NameIs("ol"))
                return el;
        }
    }

    return currElement;
}
void HtmlParser::CloseTag(char *tagName)
{
    str::ToLower(tagName);
    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    for (HtmlElement *el = currElement; el; el = el->up) {
        if (el->NameIs(tagName)) {
            currElement = el->up;
            return;
        }
    }
    // ignore the unexpected closing tag
}
Ejemplo n.º 10
0
static void HtmlParser03()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<el   att  =v&quot;al/>");
    assert(1 == p.ElementsCount());
    assert(1 == p.TotalAttrCount());
    assert(str::Eq("el", root->name));
    assert(NULL == root->next);
    assert(NULL == root->up);
    assert(NULL == root->down);
    ScopedMem<TCHAR> val(root->GetAttribute("att"));
    assert(str::Eq(val, _T("v\"al")));
    assert(!root->firstAttr->next);
}
static void HtmlParser04()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<el att=  va&apos;l></ el >");
    utassert(1 == p.ElementsCount());
    utassert(1 == p.TotalAttrCount());
    utassert(root->NameIs("el"));
    utassert(NULL == root->next);
    utassert(NULL == root->up);
    utassert(NULL == root->down);
    ScopedMem<WCHAR> val(root->GetAttribute("att"));
    utassert(str::Eq(val, L"va'l"));
    utassert(!root->firstAttr->next);
}
Ejemplo n.º 12
0
static void HtmlParserFile()
{
    TCHAR *fileName = _T("HtmlParseTest00.html");
    // We assume we're being run from obj-[dbg|rel], so the test
    // files are in ..\src\utils directory relative to exe's dir
    ScopedMem<TCHAR> exePath(GetExePath());
    const TCHAR *exeDir = path::GetBaseName(exePath);
    ScopedMem<TCHAR> p1(path::Join(exeDir, _T("..\\src\\utils")));
    ScopedMem<TCHAR> p2(path::Join(p1, fileName));
    char *d = file::ReadAll(p2, NULL);
    // it's ok if we fail - we assume we were not run from the
    // right location
    if (!d)
        return;
    HtmlParser p;
    HtmlElement *root = p.ParseInPlace(d);
    assert(root);
    assert(709 == p.ElementsCount());
    assert(955 == p.TotalAttrCount());
    assert(str::Eq(root->name, "html"));
    HtmlElement *el = root->down;
    assert(str::Eq(el->name, "head"));
    el = el->next;
    assert(str::Eq(el->name, "body"));
    el = el->down;
    assert(str::Eq(el->name, "object"));
    el = el->next;
    assert(str::Eq(el->name, "ul"));
    el = el->down;
    assert(str::Eq(el->name, "li"));
    el = el->down;
    assert(str::Eq(el->name, "object"));
    ScopedMem<TCHAR> val(el->GetAttribute("type"));
    assert(str::Eq(val, _T("text/sitemap")));
    el = el->down;
    assert(str::Eq(el->name, "param"));
    assert(!el->down);
    assert(str::Eq(el->next->name, "param"));
    el = p.FindElementByName("body");
    assert(el);
    el = p.FindElementByName("ul", el);
    assert(el);
    int count = 0;
    while (el) {
        ++count;
        el = p.FindElementByName("ul", el);
    }
    assert(18 == count);
    free(d);
}
Ejemplo n.º 13
0
static void HtmlParser07()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<test umls=&auml;\xC3\xB6&#xFC; zero=&#1;&#0;&#-1;>", CP_UTF8);
    assert(1 == p.ElementsCount());
    ScopedMem<TCHAR> val(root->GetAttribute("umls"));
#ifdef UNICODE
    assert(str::Eq(val, L"\xE4\xF6\xFC"));
#else
    assert(str::EndsWith(val, "\xFC"));
#endif
    val.Set(root->GetAttribute("zero"));
    assert(str::Eq(val, _T("\x01??")));
}
static void HtmlParser00()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<a></A>");
    utassert(p.ElementsCount() == 1);
    utassert(root);
    utassert(Tag_A == root->tag && !root->name);
    utassert(root->NameIs("a"));

    root = p.Parse("<b></B>");
    utassert(p.ElementsCount() == 1);
    utassert(root);
    utassert(Tag_B == root->tag && !root->name);
    utassert(root->NameIs("b"));
}
Ejemplo n.º 15
0
static void HtmlParser09()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />");
    assert(1 == p.ElementsCount());
    assert(1 == p.TotalAttrCount());
    assert(str::Eq("root", root->name));
    ScopedMem<TCHAR> val(root->GetAttribute("attr"));
    assert(str::Eq(val, _T("<!-- comment -->")));

    root = p.Parse("<!-- comment with \" and \' --><main />");
    assert(1 == p.ElementsCount());
    assert(0 == p.TotalAttrCount());
    assert(str::Eq("main", root->name));
}
Ejemplo n.º 16
0
// ignores any <ul><li> list structure and just extracts a linear list of <object type="text/sitemap">...</object>
static bool WalkBrokenChmTocOrIndex(EbookTocVisitor* visitor, HtmlParser& p, UINT cp, bool isIndex) {
    bool hadOne = false;

    HtmlElement* el = p.FindElementByName("body");
    while ((el = p.FindElementByName("object", el)) != nullptr) {
        AutoFreeW type(el->GetAttribute("type"));
        if (!str::EqI(type, L"text/sitemap"))
            continue;
        if (isIndex)
            hadOne |= VisitChmIndexItem(visitor, el, cp, 1);
        else
            hadOne |= VisitChmTocItem(visitor, el, cp, 1);
    }

    return hadOne;
}
static void HtmlParser01()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<A><bAh></a>");
    utassert(p.ElementsCount() == 2);
    utassert(Tag_A == root->tag && !root->name);
    utassert(NULL == root->up);
    utassert(NULL == root->next);
    HtmlElement *el = root->down;
    utassert(NULL == el->firstAttr);
    utassert(el->NameIs("bah") && el->NameIs("BAH"));
    utassert(Tag_NotFound == el->tag && str::Eq("bAh", el->name));
    utassert(el->up == root);
    utassert(NULL == el->down);
    utassert(NULL == el->next);
}
Ejemplo n.º 18
0
void HtmlParser::CloseTag(HtmlToken *tok)
{
    char *tagName = NULL;
    if (Tag_NotFound == tok->tag) {
        tagName = (char *)tok->s;
        char *tagEnd = tagName + tok->nLen;
        *tagEnd = '\0';
    }

    // to allow for lack of closing tags, e.g. in case like
    // <a><b><c></a>, we look for the first parent with matching name
    for (HtmlElement *el = currElement; el; el = el->up) {
        if (tagName ? el->NameIs(tagName) : tok->tag == el->tag) {
            currElement = el->up;
            return;
        }
    }
    // ignore the unexpected closing tag
}
Ejemplo n.º 19
0
static void HtmlParser06()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>");
    assert(9 == p.ElementsCount());
    assert(0 == p.TotalAttrCount());
    assert(str::Eq("ul", root->name));
    assert(!root->next);
    HtmlElement *el = root->GetChildByName("li");
    assert(el);
    assert(str::Eq(el->down->name, "br"));
    assert(str::Eq(el->down->next->name, "meta"));
    assert(!el->down->next->next);
    el = root->GetChildByName("li", 1);
    assert(el);
    assert(!el->next);
    el = el->GetChildByName("ol");
    assert(!el->next);
    assert(str::Eq(el->down->name, "li"));
    assert(!el->down->down);
}
bool MobiEngineImpl::Load(const WCHAR *fileName)
{
    this->fileName = str::Dup(fileName);

    doc = MobiDoc::CreateFromFile(fileName);
    if (!doc || Pdb_Mobipocket != doc->GetDocType())
        return false;

    HtmlFormatterArgs args;
    args.htmlStr = doc->GetBookHtmlData(args.htmlStrLen);
    args.pageDx = (float)pageRect.dx - 2 * pageBorder;
    args.pageDy = (float)pageRect.dy - 2 * pageBorder;
    args.fontName = DEFAULT_FONT_NAME;
    args.fontSize = DEFAULT_FONT_SIZE;
    args.textAllocator = &allocator;
    args.measureAlgo = MeasureTextQuick;

    pages = MobiFormatter(&args, doc).FormatAllPages();
    if (!ExtractPageAnchors())
        return false;

    HtmlParser parser;
    if (parser.Parse(args.htmlStr)) {
        HtmlElement *ref = NULL;
        while ((ref = parser.FindElementByName("reference", ref))) {
            ScopedMem<WCHAR> type(ref->GetAttribute("type"));
            ScopedMem<WCHAR> filepos(ref->GetAttribute("filepos"));
            if (str::EqI(type, L"toc") && filepos) {
                unsigned int pos;
                if (str::Parse(filepos, L"%u%$", &pos) && pos < args.htmlStrLen) {
                    tocReparsePoint = args.htmlStr + pos;
                    break;
                }
            }
        }
    }

    return pages->Count() > 0;
}
Ejemplo n.º 21
0
static void WalkChmTocOrIndex(EbookTocVisitor *visitor, HtmlElement *list, UINT cp, bool isIndex, int level=1)
{
    CrashIf(!list->NameIs("ul"));

    // some broken ToCs wrap every <li> into its own <ul>
    for (; list && list->NameIs("ul"); list = list->next) {
        for (HtmlElement *el = list->down; el; el = el->next) {
            if (!el->NameIs("li"))
                continue; // ignore unexpected elements
            bool valid = (isIndex ? VisitChmIndexItem : VisitChmTocItem)(visitor, el, cp, level);
            if (!valid)
                continue; // skip incomplete elements and all their children

            HtmlElement *nested = el->GetChildByName("ul");
            // some broken ToCs have the <ul> follow right *after* a <li>
            if (!nested && el->next && el->next->NameIs("ul"))
                nested = el->next;
            if (nested)
                WalkChmTocOrIndex(visitor, nested, cp, isIndex, level + 1);
        }
    }
}
static void HtmlParser10()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x='http://example.org/ns/x'><x:b attr='val'/></x:a>");
    utassert(2 == p.ElementsCount());
    utassert(2 == p.TotalAttrCount());
    utassert(root->NameIs("x:a") && root->NameIsNS("a", "http://example.org/ns/x"));

    HtmlElement *node = p.FindElementByName("b");
    utassert(!node);
    node = p.FindElementByNameNS("b", "http://example.org/ns/x");
    utassert(node);
    utassert(node->NameIs("x:b") && node->NameIsNS("b", "http://example.org/ns/x"));
    ScopedMem<WCHAR> val(node->GetAttribute("attr"));
    utassert(str::Eq(val, L"val"));
    // TODO: XML tags are case sensitive (HTML tags aren't)
    node = p.FindElementByName("X:B");
    utassert(node && node->NameIs("X:B"));
}
static void HtmlParser06()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<ul><p>ignore<li><br><meta><li><ol><li></ul><dropme>");
    utassert(9 == p.ElementsCount());
    utassert(0 == p.TotalAttrCount());
    utassert(root->NameIs("ul"));
    utassert(!root->next);
    HtmlElement *el = root->GetChildByTag(Tag_Li);
    utassert(el);
    utassert(el->down->NameIs("br"));
    utassert(el->down->next->NameIs("meta"));
    utassert(!el->down->next->next);
    el = root->GetChildByTag(Tag_Li, 1);
    utassert(el);
    utassert(!el->next);
    el = el->GetChildByTag(Tag_Ol);
    utassert(!el->next);
    utassert(el->down->NameIs("li"));
    utassert(!el->down->down);
}