Exemple #1
0
// Consumes the next token from parser. If that token is text, returns its
// content as produced by ResolveHtmlEntities(); returns NULL when there is
// no further token or the token is not text.
static char *GetTextContent(HtmlPullParser& parser)
{
    HtmlToken *next = parser.Next();
    if (next && next->IsText())
        return ResolveHtmlEntities(next->s, next->sLen);
    return NULL;
}
// Builds the table of contents from the HTML found at tocReparsePoint.
// Every <a> element with a filepos or href attribute and some text becomes
// one entry; <blockquote>, <ul> and <ol> nesting determines its level.
// Returns NULL if there is no tocReparsePoint; otherwise returns the root
// that was handed to AppendTocItem() (it starts out as NULL).
DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return NULL;

    EbookTocItem *tocRoot = NULL;
    ScopedMem<WCHAR> pendingText;
    ScopedMem<WCHAR> pendingLink;
    int nesting = 0;
    int lastId = 0;

    // Mobi ToCs don't seem to follow any standard, so the author's
    // intentions are guessed from commonly used tags
    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        // text inside an open link accumulates into the entry's title
        if (pendingLink && tok->IsText()) {
            ScopedMem<WCHAR> fragment(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (!pendingText)
                pendingText.Set(fragment.StealData());
            else
                pendingText.Set(str::Join(pendingText, L" ", fragment));
            continue;
        }
        if (!tok->IsTag())
            continue;
        // the first page break ends the scan
        if (Tag_Mbp_Pagebreak == tok->tag)
            break;

        const bool isAnchor = Tag_A == tok->tag;
        if (isAnchor && !pendingLink && tok->IsStartTag()) {
            // prefer the filepos attribute, fall back to href
            AttrInfo *target = tok->GetAttrByName("filepos");
            if (!target)
                target = tok->GetAttrByName("href");
            if (target)
                pendingLink.Set(str::conv::FromHtmlUtf8(target->val, target->valLen));
            continue;
        }
        if (isAnchor && pendingLink && tok->IsEndTag()) {
            if (!pendingText) {
                // a link without any text yields no entry
                pendingLink.Set(NULL);
                continue;
            }
            PageDestination *dest = NULL;
            if (IsExternalUrl(pendingLink))
                dest = new SimpleDest2(0, RectD(), pendingLink.StealData());
            else
                dest = GetNamedDest(pendingLink);
            EbookTocItem *entry = new EbookTocItem(pendingText.StealData(), dest);
            entry->id = ++lastId;
            entry->open = nesting <= 2;
            AppendTocItem(tocRoot, entry, nesting);
            pendingLink.Set(NULL);
            continue;
        }

        const bool changesNesting = Tag_Blockquote == tok->tag || Tag_Ul == tok->tag || Tag_Ol == tok->tag;
        if (!changesNesting)
            continue;
        if (tok->IsStartTag())
            nesting++;
        else if (tok->IsEndTag() && nesting > 0)
            nesting--;
    }

    return tocRoot;
}
static void Test02() {
    const char* s = "<p>Last paragraph";
    HtmlPullParser parser(s, str::Len(s));
    HtmlToken* t = parser.Next();
    utassert(t && t->IsTag() && t->IsStartTag() && Tag_P == t->tag);
    t = parser.Next();
    utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "Last paragraph"));
}
static void Test03() {
    const char* s = "a < b > c <> d <";
    HtmlPullParser parser(s, str::Len(s));
    HtmlToken* t = parser.Next();
    utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "a "));
    t = parser.Next();
    utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "< b > c "));
    t = parser.Next();
    utassert(t && t->IsText() && str::EqNIx(t->s, t->sLen, "<> d "));
    t = parser.Next();
    utassert(t && t->IsError() && HtmlToken::UnclosedTag == t->error);
    t = parser.Next();
    utassert(!t);
}
// The name is a slight misnomer: the tags handled here are FB2 tags.
// Each one is either processed directly or forwarded under the name of
// an HTML tag; tags that match none of the cases are ignored.
void Fb2Formatter::HandleHtmlTag(HtmlToken *t)
{
    const bool isTitle = Tag_Title == t->tag;
    if (isTitle || Tag_Subtitle == t->tag) {
        // titles are handled as <hN> with N being the current section
        // counter, subtitles as the heading one level below that
        ScopedMem<char> hxName(str::Format("h%d", section + (isTitle ? 0 : 1)));
        HtmlToken hxTok;
        hxTok.SetTag(t->type, hxName, hxName + str::Len(hxName));
        HandleTagHx(&hxTok);
        HandleAnchorAttr(t);
        if (isTitle && t->IsStartTag()) {
            // every opening <title> additionally gets a numbered anchor
            // at the current vertical position
            const int markSize = 24;
            char *mark = (char *)Allocator::Alloc(textAllocator, markSize);
            sprintf_s(mark, markSize, FB2_TOC_ENTRY_MARK "%d", ++titleCount);
            currPage->instructions.Append(DrawInstr::Anchor(mark, str::Len(mark), RectF(0, currY, pageDx, 0)));
        }
        return;
    }

    if (Tag_Section == t->tag) {
        // track the section depth (it never drops below 1 here)
        if (t->IsStartTag())
            section++;
        else if (t->IsEndTag() && section > 1)
            section--;
        FlushCurrLine(true);
        HandleAnchorAttr(t);
        return;
    }

    if (Tag_P == t->tag) {
        // paragraphs nested inside a <title> are not forwarded
        if (!tagNesting.Contains(Tag_Title))
            HtmlFormatter::HandleHtmlTag(t);
        return;
    }

    if (Tag_Image == t->tag) {
        HandleTagImg(t);
        HandleAnchorAttr(t);
        return;
    }

    if (Tag_A == t->tag) {
        HandleTagA(t, "href", "http://www.w3.org/1999/xlink");
        HandleAnchorAttr(t, true);
        return;
    }

    if (Tag_Pagebreak == t->tag) {
        ForceNewPage();
        return;
    }

    // the remaining tags are forwarded under an HTML tag name
    if (Tag_Strong == t->tag) {
        HandleTagAsHtml(t, "b");
        return;
    }
    if (t->NameIs("emphasis")) {
        HandleTagAsHtml(t, "i");
        return;
    }
    if (t->NameIs("epigraph")) {
        HandleTagAsHtml(t, "blockquote");
        return;
    }
    if (t->NameIs("empty-line")) {
        if (!t->IsEndTag())
            EmitParagraph(0);
        return;
    }
    if (t->NameIs("stylesheet"))
        HandleTagAsHtml(t, "style");
}
Exemple #6
0
// Collects the text content of doc's HTML data and returns it converted
// with str::conv::FromUtf8(). Text inside <head>, <script> and <style> is
// skipped. Every text run is trimmed and followed by a single space; the
// end tag of a non-inline element turns that trailing space into "\r\n".
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t htmlLen;
    const char *html = doc->GetTextData(&htmlLen);

    str::Str<char> out(htmlLen / 2);
    HtmlPullParser parser(html, htmlLen);
    Vec<HtmlTag> openTags;
    for (HtmlToken *t = parser.Next(); t != NULL && !t->IsError(); t = parser.Next()) {
        if (t->IsText() && !(openTags.Contains(Tag_Head) || openTags.Contains(Tag_Script) || openTags.Contains(Tag_Style))) {
            // strip whitespace from both ends by adjusting the token itself
            // (TODO: also normalize within text?)
            while (t->sLen > 0 && str::IsWs(*t->s)) {
                ++t->s;
                --t->sLen;
            }
            while (t->sLen > 0 && str::IsWs(t->s[t->sLen - 1]))
                --t->sLen;
            if (t->sLen > 0) {
                out.AppendAndFree(ResolveHtmlEntities(t->s, t->sLen));
                out.Append(' ');
            }
            continue;
        }

        if (t->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(t->tag))
                openTags.Append(t->tag);
            continue;
        }

        if (!t->IsEndTag())
            continue;

        // closing a non-inline element ends the current line
        if (!IsInlineTag(t->tag) && out.Size() > 0 && out.Last() == ' ') {
            out.Pop();
            out.Append("\r\n");
        }
        // if the closed tag is open anywhere on the stack, drop everything
        // stacked above it and then the tag itself; an end tag without
        // a matching open tag leaves the stack untouched
        if (openTags.Contains(t->tag)) {
            while (openTags.Last() != t->tag)
                openTags.Pop();
            openTags.Pop();
        }
    }

    return str::conv::FromUtf8(out.Get());
}
static void Test00(const char *s, HtmlToken::TokenType expectedType) {
    HtmlPullParser parser(s, str::Len(s));
    HtmlToken *t = parser.Next();
    assert(t->type == expectedType);
    assert(t->NameIs("p"));
    assert(Tag_P == t->tag);
    AttrInfo *a = t->GetAttrByName("a1");
    assert(a->NameIs("a1"));
    assert(a->ValIs(">"));

    a = t->GetAttrByName("foo");
    assert(a->NameIs("foo"));
    assert(a->ValIs("bar"));

    a = t->GetAttrByName("nope");
    assert(!a);

    t = parser.Next();
    assert(!t);
}
// Parse s in place i.e. we assume we can modify it. Must be 0-terminated.
// The caller owns the memory for s.
// Tag names, attribute names and attribute values are turned into C strings
// by writing '\0' terminators directly into s.
// codepage is only stored in this->codepage here.
// Returns rootElement once all tokens have been consumed, or the result of
// ParseError() as soon as the pull parser reports an error token.
HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage)
{
    // html is set by every call (see below); if it's already set, Reset()
    // is called first - presumably to discard the previous parse result
    if (this->html)
        Reset();
    this->html = s;
    this->codepage = codepage;

    HtmlPullParser parser(s, strlen(s));
    HtmlToken *tok;

    while ((tok = parser.Next())) {
        // for tag tokens this is used as the start of the tag name,
        // for error tokens as the error context
        char *tag = (char *)tok->s;
        if (tok->IsError()) {
            errorContext = tag;
            // translate the pull parser's error into this parser's error codes
            switch (tok->error) {
                case HtmlToken::UnclosedTag: return ParseError(ErrParsingElementName);
                case HtmlToken::InvalidTag:  return ParseError(ErrParsingClosingElement);
                default:                     return ParseError(ErrParsingElement);
            }
        }
        if (!tok->IsTag()) {
            // ignore text content
            assert(tok->IsText());
            continue;
        }
        // position right after the tag name (nLen characters);
        // a terminator is written there below
        char *tagEnd = tag + tok->nLen;
        if (!tok->IsEndTag()) {
            // note: call tok->NextAttr() before zero-terminating names and values
            AttrInfo *attr = tok->NextAttr();
            *tagEnd = '\0';
            StartTag(tag);

            while (attr) {
                // remember the current attribute's boundaries and advance to
                // the next attribute before any terminator is written
                // (cf. the note above)
                char *name = (char *)attr->name;
                char *nameEnd = name + attr->nameLen;
                char *value = (char *)attr->val;
                char *valueEnd = value + attr->valLen;
                attr = tok->NextAttr();

                *nameEnd = *valueEnd = '\0';
                AppendAttr(name, value);
            }
        }
        // end tags, any other token that isn't a plain start tag and tags
        // for which IsTagSelfClosing() holds are closed right away
        if (!tok->IsStartTag() || IsTagSelfClosing(tok->tag)) {
            // for end tags the name hasn't been terminated yet
            // (the block above is skipped for them)
            *tagEnd = '\0';
            CloseTag(tag);
        }
    }

    return rootElement;
}
// Return the next parsed page. Returns NULL if finished parsing.
// For simplicity of implementation, we parse one xml text node or
// xml element at a time. This might cause the creation of one
// or more pages, which we remember and send to the caller
// if we detect accumulated pages.
// If skipEmptyPages is true, pages for which IsEmptyPage() holds are
// deleted instead of being returned (they still count towards pageCount).
// The returned page has been removed from pagesToSend.
HtmlPage *HtmlFormatter::Next(bool skipEmptyPages)
{
    for (;;)
    {
        // send out all pages accumulated so far
        while (pagesToSend.Count() > 0) {
            HtmlPage *ret = pagesToSend.At(0);
            pagesToSend.RemoveAt(0);
            pageCount++;
            if (skipEmptyPages && IsEmptyPage(ret))
                delete ret;
            else
                return ret;
        }
        // we can call ourselves recursively to send outstanding
        // pages after parsing has finished so this is to detect
        // that case and really end parsing
        if (finishedParsing)
            return NULL;
        HtmlToken *t = htmlParser->Next();
        if (!t || t->IsError())
            break;

        // remember where in the source the current token starts
        currReparseIdx = t->GetReparsePoint() - htmlParser->Start();
        CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser));
        if (t->IsTag())
            HandleHtmlTag(t);
        else if (!IgnoreText())
            HandleText(t);
    }
    // force layout of the last line
    AutoCloseTags(tagNesting.Count());
    FlushCurrLine(true);

    UpdateLinkBboxes(currPage);
    pagesToSend.Append(currPage);
    currPage = NULL;
    // call ourselves recursively to return accumulated pages; the caller's
    // skipEmptyPages choice must be forwarded, else the last pages would be
    // filtered according to the parameter's default value instead
    finishedParsing = true;
    return Next(skipEmptyPages);
}
Exemple #10
0
// extract ComicInfo.xml metadata
// cf. http://comicrack.cyolito.com/downloads/comicrack/ComicRack/Support-Files/ComicInfoSchema.zip/
// xmlData must be 0-terminated. For each of the elements Title, Year, Month,
// Summary, Writer and Penciller whose start tag is directly followed by
// text, that text is passed to Visit() under the corresponding
// ComicBookInfo path. Scanning stops at the first parse error.
void CbxEngineImpl::ParseComicInfoXml(const char *xmlData)
{
    HtmlPullParser parser(xmlData, str::Len(xmlData));
    HtmlToken *tok;
    while ((tok = parser.Next()) && !tok->IsError()) {
        if (!tok->IsStartTag())
            continue;
        // note: GetTextContent() advances the parser, so tok must not
        // be relied upon after calling it
        if (tok->NameIs("Title")) {
            ScopedMem<char> value(GetTextContent(parser));
            if (value)
                Visit("/ComicBookInfo/1.0/title", value, json::Type_String);
        }
        else if (tok->NameIs("Year")) {
            ScopedMem<char> value(GetTextContent(parser));
            if (value)
                Visit("/ComicBookInfo/1.0/publicationYear", value, json::Type_Number);
        }
        else if (tok->NameIs("Month")) {
            ScopedMem<char> value(GetTextContent(parser));
            if (value)
                Visit("/ComicBookInfo/1.0/publicationMonth", value, json::Type_Number);
        }
        else if (tok->NameIs("Summary")) {
            ScopedMem<char> value(GetTextContent(parser));
            if (value)
                Visit("/X-summary", value, json::Type_String);
        }
        else if (tok->NameIs("Writer")) {
            // the writer is reported as the first (primary) credit
            ScopedMem<char> value(GetTextContent(parser));
            if (value) {
                Visit("/ComicBookInfo/1.0/credits[0]/person", value, json::Type_String);
                Visit("/ComicBookInfo/1.0/credits[0]/primary", "true", json::Type_Bool);
            }
        }
        else if (tok->NameIs("Penciller")) {
            // the penciller is reported as the second (also primary) credit
            ScopedMem<char> value(GetTextContent(parser));
            if (value) {
                Visit("/ComicBookInfo/1.0/credits[1]/person", value, json::Type_String);
                Visit("/ComicBookInfo/1.0/credits[1]/primary", "true", json::Type_Bool);
            }
        }
    }
}
// cf. http://www.w3.org/TR/html4/charset.html#h-5.2.2
// Looks for a <meta http-equiv="Content-Type" content="...; charset=...">
// declaration within the first 1024 bytes of html (fewer if htmlLen is
// smaller) and returns the codepage listed for that charset name.
// Returns 0 if there is no such declaration or the name isn't listed.
static UINT ExtractHttpCharset(const char *html, size_t htmlLen)
{
    // charset names (compared case-insensitively) and their codepages
    static struct {
        const char *name;
        UINT codepage;
    } knownCharsets[] = {
        { "ISO-8859-1", 1252 }, { "Latin1", 1252 }, { "CP1252", 1252 }, { "Windows-1252", 1252 },
        { "ISO-8859-2", 28592 }, { "Latin2", 28592 },
        { "CP1251", 1251 }, { "Windows-1251", 1251 }, { "KOI8-R", 20866 },
        { "shift-jis", 932 }, { "x-euc", 932 }, { "euc-kr", 949 },
        { "Big5", 950 }, { "GB2312", 936 },
        { "UTF-8", CP_UTF8 },
    };

    // quick rejection before tokenizing anything; note that this scans
    // html up to its terminating 0, independently of htmlLen
    if (!strstr(html, "charset="))
        return 0;

    HtmlPullParser parser(html, min(htmlLen, 1024));
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        if (Tag_Meta != tok->tag)
            continue;
        AttrInfo *httpEquiv = tok->GetAttrByName("http-equiv");
        if (!httpEquiv || !httpEquiv->ValIs("Content-Type"))
            continue;
        AttrInfo *content = tok->GetAttrByName("content");
        ScopedMem<char> mimetype, charset;
        if (!content || !str::Parse(content->val, content->valLen, "%S;%_charset=%S", &mimetype, &charset))
            continue;

        for (size_t i = 0; i < dimof(knownCharsets); i++) {
            if (str::EqI(charset, knownCharsets[i].name))
                return knownCharsets[i].codepage;
        }
        // only the first parseable Content-Type declaration is considered
        break;
    }

    return 0;
}
// Builds the table of contents from the document's <title> elements:
// the text inside each title becomes one entry, whose level is the
// <section> nesting depth at the title's end tag. Titles are counted in
// document order (including empty ones) and the count is used both as
// the entry's id and for the name of its destination.
// Returns the root that was handed to AppendTocItem() (initially NULL).
DocTocItem *Fb2EngineImpl::GetTocTree()
{
    EbookTocItem *tocRoot = NULL;
    ScopedMem<WCHAR> titleText;
    int titlesSeen = 0;
    bool insideTitle = false;
    int depth = 0;

    size_t xmlLen;
    const char *xml = doc->GetTextData(&xmlLen);
    HtmlPullParser parser(xml, xmlLen);
    for (HtmlToken *tok = parser.Next(); tok && !tok->IsError(); tok = parser.Next()) {
        if (tok->IsStartTag() && Tag_Section == tok->tag) {
            depth++;
            continue;
        }
        if (tok->IsEndTag() && Tag_Section == tok->tag && depth > 0) {
            depth--;
            continue;
        }
        if (tok->IsStartTag() && Tag_Title == tok->tag) {
            insideTitle = true;
            titlesSeen++;
            continue;
        }
        if (tok->IsEndTag() && Tag_Title == tok->tag) {
            if (titleText)
                str::NormalizeWS(titleText);
            // titles without any text don't get an entry
            if (!str::IsEmpty(titleText.Get())) {
                ScopedMem<WCHAR> destName(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", titlesSeen));
                PageDestination *dest = GetNamedDest(destName);
                EbookTocItem *entry = new EbookTocItem(titleText.StealData(), dest);
                entry->id = titlesSeen;
                entry->open = depth <= 2;
                AppendTocItem(tocRoot, entry, depth);
            }
            insideTitle = false;
            continue;
        }
        if (insideTitle && tok->IsText()) {
            // several text runs within one title are joined with spaces
            ScopedMem<WCHAR> fragment(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (!str::IsEmpty(titleText.Get()))
                titleText.Set(str::Join(titleText, L" ", fragment));
            else
                titleText.Set(fragment.StealData());
        }
    }

    return tocRoot;
}
void EbookController::ExtractPageAnchors()
{
    if (pageAnchorIds || pageAnchorIdxs) {
        CrashIf(!pageAnchorIds || !pageAnchorIdxs);
        return;
    }

    pageAnchorIds = new WStrVec();
    pageAnchorIdxs = new Vec<int>();

    ScopedMem<WCHAR> epubPagePath;
    int fb2TitleCount = 0;
    size_t len;
    const char *data = doc.GetHtmlData(len);
    HtmlPullParser parser(data, len);
    HtmlToken *tok;
    while ((tok = parser.Next()) != nullptr && !tok->IsError()) {
        if (!tok->IsStartTag() && !tok->IsEmptyElementEndTag())
            continue;
        AttrInfo *attr = tok->GetAttrByName("id");
        if (!attr && Tag_A == tok->tag && doc.Type() != Doc_Fb2)
            attr = tok->GetAttrByName("name");
        if (attr) {
            ScopedMem<WCHAR> id(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Format(L"%s#%s", epubPagePath ? epubPagePath : L"", id.Get()));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // update EPUB page paths and create an anchor per chapter
        if (Tag_Pagebreak == tok->tag &&
            (attr = tok->GetAttrByName("page_path")) != nullptr &&
            str::StartsWith(attr->val + attr->valLen, "\" page_marker />")) {
            CrashIf(doc.Type() != Doc_Epub);
            epubPagePath.Set(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Dup(epubPagePath));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // create FB2 title anchors (cf. Fb2Doc::ParseToc)
        if (Tag_Title == tok->tag && tok->IsStartTag() && Doc_Fb2 == doc.Type()) {
            ScopedMem<WCHAR> id(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", ++fb2TitleCount));
            pageAnchorIds->Append(id.StealData());
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
    }
}
// Forwards t to HtmlFormatter::HandleHtmlTag() as if it were a tag called
// name: a fresh token of the same type as t is set up with that name.
// Nothing else from t is carried over to the new token here.
void Fb2Formatter::HandleTagAsHtml(HtmlToken *t, const char *name)
{
    const char *nameEnd = name + str::Len(name);
    HtmlToken htmlTok;
    htmlTok.SetTag(t->type, name, nameEnd);
    HtmlFormatter::HandleHtmlTag(&htmlTok);
}