static void HtmlParser02() { HtmlParser p; HtmlElement *root = p.Parse("<a><b/><c></c ><d at1=\"<quo&ted>\" at2='also quoted' att3=notquoted att4=end/></a>"); assert(4 == p.ElementsCount()); assert(4 == p.TotalAttrCount()); assert(str::Eq("a", root->name)); assert(NULL == root->next); HtmlElement *el = root->down; assert(str::Eq("b", el->name)); assert(root == el->up); el = el->next; assert(str::Eq("c", el->name)); assert(root == el->up); el = el->next; assert(str::Eq("d", el->name)); assert(NULL == el->next); assert(root == el->up); ScopedMem<TCHAR> val(el->GetAttribute("at1")); assert(str::Eq(val, _T("<quo&ted>"))); val.Set(el->GetAttribute("at2")); assert(str::Eq(val, _T("also quoted"))); val.Set(el->GetAttribute("att3")); assert(str::Eq(val, _T("notquoted"))); val.Set(el->GetAttribute("att4")); assert(str::Eq(val, _T("end"))); }
// Checks attribute decoding for UTF-8 input: non-ASCII characters in a value
// and numeric character references (including out-of-range ones), looked up
// with a differently-cased attribute name.
static void HtmlParser07()
{
    HtmlParser p;
    // The umlauts are spelled as UTF-8 hex escapes so the literal does not
    // depend on the encoding this source file is saved in.
    // NOTE(review): "&#1;&#0;" was reconstructed from the expected value
    // "\x01??" below (the text had been mangled into a single replacement
    // character) - confirm against the upstream test.
    HtmlElement *root = p.Parse("<test umls=\xC3\xA4\xC3\xB6\xC3\xBC Zero=&#1;&#0;&#-1;>", CP_UTF8);
    utassert(1 == p.ElementsCount());

    ScopedMem<WCHAR> val(root->GetAttribute("umls"));
    utassert(str::Eq(val, L"\xE4\xF6\xFC"));

    // attribute is declared as "Zero" but requested as "zerO"
    val.Set(root->GetAttribute("zerO"));
    utassert(str::Eq(val, L"\x01??"));
}
static void HtmlParser07() { HtmlParser p; HtmlElement *root = p.Parse("<test umls=ä\xC3\xB6ü zero=�&#-1;>", CP_UTF8); assert(1 == p.ElementsCount()); ScopedMem<TCHAR> val(root->GetAttribute("umls")); #ifdef UNICODE assert(str::Eq(val, L"\xE4\xF6\xFC")); #else assert(str::EndsWith(val, "\xFC")); #endif val.Set(root->GetAttribute("zero")); assert(str::Eq(val, _T("\x01??"))); }
bool MobiEngineImpl::Load(const WCHAR *fileName) { this->fileName = str::Dup(fileName); doc = MobiDoc::CreateFromFile(fileName); if (!doc || Pdb_Mobipocket != doc->GetDocType()) return false; HtmlFormatterArgs args; args.htmlStr = doc->GetBookHtmlData(args.htmlStrLen); args.pageDx = (float)pageRect.dx - 2 * pageBorder; args.pageDy = (float)pageRect.dy - 2 * pageBorder; args.fontName = DEFAULT_FONT_NAME; args.fontSize = DEFAULT_FONT_SIZE; args.textAllocator = &allocator; args.measureAlgo = MeasureTextQuick; pages = MobiFormatter(&args, doc).FormatAllPages(); if (!ExtractPageAnchors()) return false; HtmlParser parser; if (parser.Parse(args.htmlStr)) { HtmlElement *ref = NULL; while ((ref = parser.FindElementByName("reference", ref))) { ScopedMem<WCHAR> type(ref->GetAttribute("type")); ScopedMem<WCHAR> filepos(ref->GetAttribute("filepos")); if (str::EqI(type, L"toc") && filepos) { unsigned int pos; if (str::Parse(filepos, L"%u%$", &pos) && pos < args.htmlStrLen) { tocReparsePoint = args.htmlStr + pos; break; } } } } return pages->Count() > 0; }
static void HtmlParser03() { HtmlParser p; HtmlElement *root = p.Parse("<el att =v"al/>"); assert(1 == p.ElementsCount()); assert(1 == p.TotalAttrCount()); assert(str::Eq("el", root->name)); assert(NULL == root->next); assert(NULL == root->up); assert(NULL == root->down); ScopedMem<TCHAR> val(root->GetAttribute("att")); assert(str::Eq(val, _T("v\"al"))); assert(!root->firstAttr->next); }
static void HtmlParserFile() { TCHAR *fileName = _T("HtmlParseTest00.html"); // We assume we're being run from obj-[dbg|rel], so the test // files are in ..\src\utils directory relative to exe's dir ScopedMem<TCHAR> exePath(GetExePath()); const TCHAR *exeDir = path::GetBaseName(exePath); ScopedMem<TCHAR> p1(path::Join(exeDir, _T("..\\src\\utils"))); ScopedMem<TCHAR> p2(path::Join(p1, fileName)); char *d = file::ReadAll(p2, NULL); // it's ok if we fail - we assume we were not run from the // right location if (!d) return; HtmlParser p; HtmlElement *root = p.ParseInPlace(d); assert(root); assert(709 == p.ElementsCount()); assert(955 == p.TotalAttrCount()); assert(str::Eq(root->name, "html")); HtmlElement *el = root->down; assert(str::Eq(el->name, "head")); el = el->next; assert(str::Eq(el->name, "body")); el = el->down; assert(str::Eq(el->name, "object")); el = el->next; assert(str::Eq(el->name, "ul")); el = el->down; assert(str::Eq(el->name, "li")); el = el->down; assert(str::Eq(el->name, "object")); ScopedMem<TCHAR> val(el->GetAttribute("type")); assert(str::Eq(val, _T("text/sitemap"))); el = el->down; assert(str::Eq(el->name, "param")); assert(!el->down); assert(str::Eq(el->next->name, "param")); el = p.FindElementByName("body"); assert(el); el = p.FindElementByName("ul", el); assert(el); int count = 0; while (el) { ++count; el = p.FindElementByName("ul", el); } assert(18 == count); free(d); }
static void HtmlParser04() { HtmlParser p; HtmlElement *root = p.Parse("<el att= va'l></ el >"); utassert(1 == p.ElementsCount()); utassert(1 == p.TotalAttrCount()); utassert(root->NameIs("el")); utassert(NULL == root->next); utassert(NULL == root->up); utassert(NULL == root->down); ScopedMem<WCHAR> val(root->GetAttribute("att")); utassert(str::Eq(val, L"va'l")); utassert(!root->firstAttr->next); }
static void HtmlParser09() { HtmlParser p; HtmlElement *root = p.Parse("<?xml version='1.0'?><!-- <html><body></html> --><root attr='<!-- comment -->' />"); assert(1 == p.ElementsCount()); assert(1 == p.TotalAttrCount()); assert(str::Eq("root", root->name)); ScopedMem<TCHAR> val(root->GetAttribute("attr")); assert(str::Eq(val, _T("<!-- comment -->"))); root = p.Parse("<!-- comment with \" and \' --><main />"); assert(1 == p.ElementsCount()); assert(0 == p.TotalAttrCount()); assert(str::Eq("main", root->name)); }
// ignores any <ul><li> list structure and just extracts a linear list of <object type="text/sitemap">...</object> static bool WalkBrokenChmTocOrIndex(EbookTocVisitor* visitor, HtmlParser& p, UINT cp, bool isIndex) { bool hadOne = false; HtmlElement* el = p.FindElementByName("body"); while ((el = p.FindElementByName("object", el)) != nullptr) { AutoFreeW type(el->GetAttribute("type")); if (!str::EqI(type, L"text/sitemap")) continue; if (isIndex) hadOne |= VisitChmIndexItem(visitor, el, cp, 1); else hadOne |= VisitChmTocItem(visitor, el, cp, 1); } return hadOne; }
static void HtmlParser10() { HtmlParser p; HtmlElement *root = p.Parse("<!xml version='1.0'?><x:a xmlns:x='http://example.org/ns/x'><x:b attr='val'/></x:a>"); utassert(2 == p.ElementsCount()); utassert(2 == p.TotalAttrCount()); utassert(root->NameIs("x:a") && root->NameIsNS("a", "http://example.org/ns/x")); HtmlElement *node = p.FindElementByName("b"); utassert(!node); node = p.FindElementByNameNS("b", "http://example.org/ns/x"); utassert(node); utassert(node->NameIs("x:b") && node->NameIsNS("b", "http://example.org/ns/x")); ScopedMem<WCHAR> val(node->GetAttribute("attr")); utassert(str::Eq(val, L"val")); // TODO: XML tags are case sensitive (HTML tags aren't) node = p.FindElementByName("X:B"); utassert(node && node->NameIs("X:B")); }