static void HtmlParser02()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<a><b/><c></c  ><d at1=\"&lt;quo&amp;ted&gt;\" at2='also quoted'   att3=notquoted att4=&#101;&#x6e;d/></a>");
    assert(4 == p.ElementsCount());
    assert(4 == p.TotalAttrCount());
    assert(str::Eq("a", root->name));
    assert(NULL == root->next);
    HtmlElement *el = root->down;
    assert(str::Eq("b", el->name));
    assert(root == el->up);
    el = el->next;
    assert(str::Eq("c", el->name));
    assert(root == el->up);
    el = el->next;
    assert(str::Eq("d", el->name));
    assert(NULL == el->next);
    assert(root == el->up);
    ScopedMem<TCHAR> val(el->GetAttribute("at1"));
    assert(str::Eq(val, _T("<quo&ted>")));
    val.Set(el->GetAttribute("at2"));
    assert(str::Eq(val, _T("also quoted")));
    val.Set(el->GetAttribute("att3"));
    assert(str::Eq(val, _T("notquoted")));
    val.Set(el->GetAttribute("att4"));
    assert(str::Eq(val, _T("end")));
}
static void HtmlParser07()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<test umls=&auml;\xC3\xB6&#xFC; Zero=&#1;&#0;&#-1;>", CP_UTF8);
    utassert(1 == p.ElementsCount());
    ScopedMem<WCHAR> val(root->GetAttribute("umls"));
    utassert(str::Eq(val, L"\xE4\xF6\xFC"));
    val.Set(root->GetAttribute("zerO"));
    utassert(str::Eq(val, L"\x01??"));
}
static void HtmlParser07()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<test umls=&auml;\xC3\xB6&#xFC; zero=&#1;&#0;&#-1;>", CP_UTF8);
    assert(1 == p.ElementsCount());
    ScopedMem<TCHAR> val(root->GetAttribute("umls"));
#ifdef UNICODE
    assert(str::Eq(val, L"\xE4\xF6\xFC"));
#else
    assert(str::EndsWith(val, "\xFC"));
#endif
    val.Set(root->GetAttribute("zero"));
    assert(str::Eq(val, _T("\x01??")));
}
// Opens fileName as a Mobipocket document, formats all of its pages and
// extracts the page anchors. Afterwards the book HTML is scanned for a
// <reference type="toc" filepos="N"> element; when one with a valid offset
// is found, tocReparsePoint is set to that position inside the HTML.
// Returns false if the document can't be created, isn't of Mobipocket type,
// anchor extraction fails or no page was produced.
bool MobiEngineImpl::Load(const WCHAR *fileName)
{
    this->fileName = str::Dup(fileName);

    doc = MobiDoc::CreateFromFile(fileName);
    if (!doc)
        return false;
    if (doc->GetDocType() != Pdb_Mobipocket)
        return false;

    // layout parameters for the formatter
    HtmlFormatterArgs args;
    args.fontName = DEFAULT_FONT_NAME;
    args.fontSize = DEFAULT_FONT_SIZE;
    args.pageDx = static_cast<float>(pageRect.dx) - 2 * pageBorder;
    args.pageDy = static_cast<float>(pageRect.dy) - 2 * pageBorder;
    args.textAllocator = &allocator;
    args.measureAlgo = MeasureTextQuick;
    args.htmlStr = doc->GetBookHtmlData(args.htmlStrLen);

    pages = MobiFormatter(&args, doc).FormatAllPages();
    if (!ExtractPageAnchors())
        return false;

    // look for the first usable table-of-contents reference
    HtmlParser parser;
    if (parser.Parse(args.htmlStr)) {
        for (HtmlElement *ref = parser.FindElementByName("reference", NULL);
             ref;
             ref = parser.FindElementByName("reference", ref)) {
            ScopedMem<WCHAR> type(ref->GetAttribute("type"));
            ScopedMem<WCHAR> filepos(ref->GetAttribute("filepos"));
            if (!str::EqI(type, L"toc") || !filepos)
                continue;

            // filepos must be a plain number that lies inside the HTML data
            unsigned int pos;
            if (str::Parse(filepos, L"%u%$", &pos) && pos < args.htmlStrLen) {
                tocReparsePoint = args.htmlStr + pos;
                break;
            }
        }
    }

    return pages->Count() > 0;
}
static void HtmlParser03()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<el   att  =v&quot;al/>");
    assert(1 == p.ElementsCount());
    assert(1 == p.TotalAttrCount());
    assert(str::Eq("el", root->name));
    assert(NULL == root->next);
    assert(NULL == root->up);
    assert(NULL == root->down);
    ScopedMem<TCHAR> val(root->GetAttribute("att"));
    assert(str::Eq(val, _T("v\"al")));
    assert(!root->firstAttr->next);
}
// Parses the on-disk fixture HtmlParseTest00.html and verifies the total
// element and attribute counts, the shape of the top of the tree and the
// number of <ul> elements found by repeated FindElementByName() calls.
// Returns silently (without testing anything) if the fixture can't be read.
static void HtmlParserFile()
{
    // const-qualified: binding a string literal to a non-const pointer is
    // ill-formed since C++11
    const TCHAR *fileName = _T("HtmlParseTest00.html");
    // We assume we're being run from obj-[dbg|rel], so the test
    // files are in ..\src\utils directory relative to exe's dir
    ScopedMem<TCHAR> exePath(GetExePath());
    // NOTE(review): named exeDir but obtained via path::GetBaseName() -
    // confirm that this yields the directory the comment above expects
    const TCHAR *exeDir = path::GetBaseName(exePath);
    ScopedMem<TCHAR> p1(path::Join(exeDir, _T("..\\src\\utils")));
    ScopedMem<TCHAR> p2(path::Join(p1, fileName));
    char *d = file::ReadAll(p2, NULL);
    // it's ok if we fail - we assume we were not run from the
    // right location
    if (!d)
        return;
    HtmlParser p;
    // d is handed to ParseInPlace() and only released at the very end of
    // the test (presumably the parsed tree refers into this buffer)
    HtmlElement *root = p.ParseInPlace(d);
    assert(root);
    assert(709 == p.ElementsCount());
    assert(955 == p.TotalAttrCount());

    // walk down: html > head, body > object, ul > li > object > param, param
    assert(str::Eq(root->name, "html"));
    HtmlElement *el = root->down;
    assert(str::Eq(el->name, "head"));
    el = el->next;
    assert(str::Eq(el->name, "body"));
    el = el->down;
    assert(str::Eq(el->name, "object"));
    el = el->next;
    assert(str::Eq(el->name, "ul"));
    el = el->down;
    assert(str::Eq(el->name, "li"));
    el = el->down;
    assert(str::Eq(el->name, "object"));
    ScopedMem<TCHAR> val(el->GetAttribute("type"));
    assert(str::Eq(val, _T("text/sitemap")));
    el = el->down;
    assert(str::Eq(el->name, "param"));
    assert(!el->down);
    assert(str::Eq(el->next->name, "param"));

    // count every <ul> that FindElementByName() reports after <body>
    el = p.FindElementByName("body");
    assert(el);
    el = p.FindElementByName("ul", el);
    assert(el);
    int count = 0;
    while (el) {
        ++count;
        el = p.FindElementByName("ul", el);
    }
    assert(18 == count);
    free(d);
}
static void HtmlParser04()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<el att=  va&apos;l></ el >");
    utassert(1 == p.ElementsCount());
    utassert(1 == p.TotalAttrCount());
    utassert(root->NameIs("el"));
    utassert(NULL == root->next);
    utassert(NULL == root->up);
    utassert(NULL == root->down);
    ScopedMem<WCHAR> val(root->GetAttribute("att"));
    utassert(str::Eq(val, L"va'l"));
    utassert(!root->firstAttr->next);
}
static void HtmlParser09()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />");
    assert(1 == p.ElementsCount());
    assert(1 == p.TotalAttrCount());
    assert(str::Eq("root", root->name));
    ScopedMem<TCHAR> val(root->GetAttribute("attr"));
    assert(str::Eq(val, _T("<!-- comment -->")));

    root = p.Parse("<!-- comment with \" and \' --><main />");
    assert(1 == p.ElementsCount());
    assert(0 == p.TotalAttrCount());
    assert(str::Eq("main", root->name));
}
// Example no. 9
// 0
// Fallback walker: disregards the <ul><li> nesting and simply visits, in
// document order, every <object type="text/sitemap">...</object> found after
// <body>. Each match is passed to the index or the TOC item visitor
// (depending on isIndex) at level 1. Returns true if at least one of those
// visitor calls returned true.
static bool WalkBrokenChmTocOrIndex(EbookTocVisitor* visitor, HtmlParser& p, UINT cp, bool isIndex) {
    bool anyVisited = false;

    for (HtmlElement* obj = p.FindElementByName("object", p.FindElementByName("body")); obj != nullptr;
         obj = p.FindElementByName("object", obj)) {
        AutoFreeW type(obj->GetAttribute("type"));
        if (str::EqI(type, L"text/sitemap")) {
            // the visitor is always invoked, even after an earlier success
            bool visited;
            if (isIndex) {
                visited = VisitChmIndexItem(visitor, obj, cp, 1);
            } else {
                visited = VisitChmTocItem(visitor, obj, cp, 1);
            }
            if (visited) {
                anyVisited = true;
            }
        }
    }

    return anyVisited;
}
static void HtmlParser10()
{
    HtmlParser p;
    HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x='http://example.org/ns/x'><x:b attr='val'/></x:a>");
    utassert(2 == p.ElementsCount());
    utassert(2 == p.TotalAttrCount());
    utassert(root->NameIs("x:a") && root->NameIsNS("a", "http://example.org/ns/x"));

    HtmlElement *node = p.FindElementByName("b");
    utassert(!node);
    node = p.FindElementByNameNS("b", "http://example.org/ns/x");
    utassert(node);
    utassert(node->NameIs("x:b") && node->NameIsNS("b", "http://example.org/ns/x"));
    ScopedMem<WCHAR> val(node->GetAttribute("attr"));
    utassert(str::Eq(val, L"val"));
    // TODO: XML tags are case sensitive (HTML tags aren't)
    node = p.FindElementByName("X:B");
    utassert(node && node->NameIs("X:B"));
}