// Builds the table-of-contents tree by scanning the HTML that starts at
// tocReparsePoint. Returns NULL when there is no tocReparsePoint or when
// no usable link was found before the first <mbp:pagebreak>.
//
// Every <a> element that has a "filepos" (preferred) or "href" attribute
// and contains some text becomes one ToC item; the nesting depth of the
// surrounding <blockquote>/<ul>/<ol> elements determines the item's level.
DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return NULL;

    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> itemText;  // text collected inside the current <a>
    ScopedMem<WCHAR> itemLink;  // target of the current <a>; non-NULL while inside it
    int itemLevel = 0;
    int idCounter = 0;

    // there doesn't seem to be a standard for Mobi ToCs, so we try to
    // determine the author's intentions by looking at commonly used tags
    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        // text inside a link: append it (space separated) to the item's title
        if (itemLink && tok->IsText()) {
            ScopedMem<WCHAR> piece(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (!itemText)
                itemText.Set(piece.StealData());
            else
                itemText.Set(str::Join(itemText, L" ", piece));
            continue;
        }

        // anything else that isn't a tag is of no interest
        if (!tok->IsTag())
            continue;

        // stop scanning at the first page break
        if (Tag_Mbp_Pagebreak == tok->tag)
            break;

        if (Tag_A == tok->tag) {
            if (!itemLink && tok->IsStartTag()) {
                // opening <a>: remember where it points to (if anywhere)
                AttrInfo *target = tok->GetAttrByName("filepos");
                if (!target)
                    target = tok->GetAttrByName("href");
                if (target)
                    itemLink.Set(str::conv::FromHtmlUtf8(target->val, target->valLen));
            }
            else if (itemLink && tok->IsEndTag()) {
                // closing </a>: turn the collected link + text into a ToC item
                if (!itemText) {
                    // links without any text are dropped
                    itemLink.Set(NULL);
                    continue;
                }
                PageDestination *dest = NULL;
                if (IsExternalUrl(itemLink))
                    dest = new SimpleDest2(0, RectD(), itemLink.StealData());
                else
                    dest = GetNamedDest(itemLink);
                EbookTocItem *entry = new EbookTocItem(itemText.StealData(), dest);
                entry->id = ++idCounter;
                entry->open = itemLevel <= 2;
                AppendTocItem(root, entry, itemLevel);
                itemLink.Set(NULL);
            }
            continue;
        }

        // <blockquote>, <ul> and <ol> change the nesting level of later items
        const bool isNestingTag = Tag_Blockquote == tok->tag ||
                                  Tag_Ul == tok->tag ||
                                  Tag_Ol == tok->tag;
        if (!isNestingTag)
            continue;
        if (tok->IsStartTag())
            itemLevel++;
        else if (tok->IsEndTag() && itemLevel > 0)
            itemLevel--;
    }

    return root;
}
// Checks that HtmlPullParser handles input that ends in the middle of an
// unclosed paragraph: the first token must be the <p> start tag and the
// second one the text that follows it.
static void Test02()
{
    const char *html = "<p>Last paragraph";
    HtmlPullParser parser(html, str::Len(html));
    HtmlToken *t;

    // first token: the opening <p>
    t = parser.Next();
    utassert(t && t->IsTag() && t->IsStartTag() && Tag_P == t->tag);

    // second token: the paragraph's text
    t = parser.Next();
    utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "Last paragraph"));
}
// Parse s in place i.e. we assume we can modify it. Must be 0-terminated. // The caller owns the memory for s. HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage) { if (this->html) Reset(); this->html = s; this->codepage = codepage; HtmlPullParser parser(s, strlen(s)); HtmlToken *tok; while ((tok = parser.Next())) { char *tag = (char *)tok->s; if (tok->IsError()) { errorContext = tag; switch (tok->error) { case HtmlToken::UnclosedTag: return ParseError(ErrParsingElementName); case HtmlToken::InvalidTag: return ParseError(ErrParsingClosingElement); default: return ParseError(ErrParsingElement); } } if (!tok->IsTag()) { // ignore text content assert(tok->IsText()); continue; } char *tagEnd = tag + tok->nLen; if (!tok->IsEndTag()) { // note: call tok->NextAttr() before zero-terminating names and values AttrInfo *attr = tok->NextAttr(); *tagEnd = '\0'; StartTag(tag); while (attr) { char *name = (char *)attr->name; char *nameEnd = name + attr->nameLen; char *value = (char *)attr->val; char *valueEnd = value + attr->valLen; attr = tok->NextAttr(); *nameEnd = *valueEnd = '\0'; AppendAttr(name, value); } } if (!tok->IsStartTag() || IsTagSelfClosing(tok->tag)) { *tagEnd = '\0'; CloseTag(tag); } } return rootElement; }
// Return the next parsed page. Returns NULL if finished parsing. // For simplicity of implementation, we parse xml text node or // xml element at a time. This might cause a creation of one // or more pages, which we remeber and send to the caller // if we detect accumulated pages. HtmlPage *HtmlFormatter::Next(bool skipEmptyPages) { for (;;) { // send out all pages accumulated so far while (pagesToSend.Count() > 0) { HtmlPage *ret = pagesToSend.At(0); pagesToSend.RemoveAt(0); pageCount++; if (skipEmptyPages && IsEmptyPage(ret)) delete ret; else return ret; } // we can call ourselves recursively to send outstanding // pages after parsing has finished so this is to detect // that case and really end parsing if (finishedParsing) return NULL; HtmlToken *t = htmlParser->Next(); if (!t || t->IsError()) break; currReparseIdx = t->GetReparsePoint() - htmlParser->Start(); CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser)); if (t->IsTag()) HandleHtmlTag(t); else if (!IgnoreText()) HandleText(t); } // force layout of the last line AutoCloseTags(tagNesting.Count()); FlushCurrLine(true); UpdateLinkBboxes(currPage); pagesToSend.Append(currPage); currPage = NULL; // call ourselves recursively to return accumulated pages finishedParsing = true; return Next(); }