DocTocItem *MobiEngineImpl::GetTocTree()
{
    if (!tocReparsePoint)
        return NULL;

    EbookTocItem *root = NULL;
    ScopedMem<WCHAR> itemText;
    ScopedMem<WCHAR> itemLink;
    int itemLevel = 0;
    int idCounter = 0;

    // there doesn't seem to be a standard for Mobi ToCs, so we try to
    // determine the author's intentions by looking at commonly used tags
    HtmlPullParser parser(tocReparsePoint, str::Len(tocReparsePoint));
    HtmlToken *tok;
    while ((tok = parser.Next()) && !tok->IsError()) {
        if (itemLink && tok->IsText()) {
            ScopedMem<WCHAR> linkText(str::conv::FromHtmlUtf8(tok->s, tok->sLen));
            if (itemText)
                itemText.Set(str::Join(itemText, L" ", linkText));
            else
                itemText.Set(linkText.StealData());
        }
        else if (!tok->IsTag())
            continue;
        else if (Tag_Mbp_Pagebreak == tok->tag)
            break;
        else if (!itemLink && tok->IsStartTag() && Tag_A == tok->tag) {
            AttrInfo *attr = tok->GetAttrByName("filepos");
            if (!attr)
                attr = tok->GetAttrByName("href");
            if (attr)
                itemLink.Set(str::conv::FromHtmlUtf8(attr->val, attr->valLen));
        }
        else if (itemLink && tok->IsEndTag() && Tag_A == tok->tag) {
            PageDestination *dest = NULL;
            if (!itemText) {
                itemLink.Set(NULL);
                continue;
            }
            if (IsExternalUrl(itemLink))
                dest = new SimpleDest2(0, RectD(), itemLink.StealData());
            else
                dest = GetNamedDest(itemLink);
            EbookTocItem *item = new EbookTocItem(itemText.StealData(), dest);
            item->id = ++idCounter;
            item->open = itemLevel <= 2;
            AppendTocItem(root, item, itemLevel);
            itemLink.Set(NULL);
        }
        else if (Tag_Blockquote == tok->tag || Tag_Ul == tok->tag || Tag_Ol == tok->tag) {
            if (tok->IsStartTag())
                itemLevel++;
            else if (tok->IsEndTag() && itemLevel > 0)
                itemLevel--;
        }
    }

    return root;
}
Esempio n. 2
0
void EbookController::ExtractPageAnchors()
{
    if (pageAnchorIds || pageAnchorIdxs) {
        CrashIf(!pageAnchorIds || !pageAnchorIdxs);
        return;
    }

    pageAnchorIds = new WStrVec();
    pageAnchorIdxs = new Vec<int>();

    ScopedMem<WCHAR> epubPagePath;
    int fb2TitleCount = 0;
    size_t len;
    const char *data = doc.GetHtmlData(len);
    HtmlPullParser parser(data, len);
    HtmlToken *tok;
    while ((tok = parser.Next()) != nullptr && !tok->IsError()) {
        if (!tok->IsStartTag() && !tok->IsEmptyElementEndTag())
            continue;
        AttrInfo *attr = tok->GetAttrByName("id");
        if (!attr && Tag_A == tok->tag && doc.Type() != Doc_Fb2)
            attr = tok->GetAttrByName("name");
        if (attr) {
            ScopedMem<WCHAR> id(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Format(L"%s#%s", epubPagePath ? epubPagePath : L"", id.Get()));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // update EPUB page paths and create an anchor per chapter
        if (Tag_Pagebreak == tok->tag &&
            (attr = tok->GetAttrByName("page_path")) != nullptr &&
            str::StartsWith(attr->val + attr->valLen, "\" page_marker />")) {
            CrashIf(doc.Type() != Doc_Epub);
            epubPagePath.Set(str::conv::FromUtf8(attr->val, attr->valLen));
            pageAnchorIds->Append(str::Dup(epubPagePath));
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
        // create FB2 title anchors (cf. Fb2Doc::ParseToc)
        if (Tag_Title == tok->tag && tok->IsStartTag() && Doc_Fb2 == doc.Type()) {
            ScopedMem<WCHAR> id(str::Format(TEXT(FB2_TOC_ENTRY_MARK) L"%d", ++fb2TitleCount));
            pageAnchorIds->Append(id.StealData());
            pageAnchorIdxs->Append((int)(tok->GetReparsePoint() - parser.Start()));
        }
    }
}
Esempio n. 3
0
static void Test00(const char *s, HtmlToken::TokenType expectedType) {
    HtmlPullParser parser(s, str::Len(s));
    HtmlToken *t = parser.Next();
    assert(t->type == expectedType);
    assert(t->NameIs("p"));
    assert(Tag_P == t->tag);
    AttrInfo *a = t->GetAttrByName("a1");
    assert(a->NameIs("a1"));
    assert(a->ValIs(">"));

    a = t->GetAttrByName("foo");
    assert(a->NameIs("foo"));
    assert(a->ValIs("bar"));

    a = t->GetAttrByName("nope");
    assert(!a);

    t = parser.Next();
    assert(!t);
}
// cf. http://www.w3.org/TR/html4/charset.html#h-5.2.2
static UINT ExtractHttpCharset(const char *html, size_t htmlLen)
{
    if (!strstr(html, "charset="))
        return 0;

    HtmlPullParser parser(html, min(htmlLen, 1024));
    HtmlToken *tok;
    while ((tok = parser.Next()) && !tok->IsError()) {
        if (tok->tag != Tag_Meta)
            continue;
        AttrInfo *attr = tok->GetAttrByName("http-equiv");
        if (!attr || !attr->ValIs("Content-Type"))
            continue;
        attr = tok->GetAttrByName("content");
        ScopedMem<char> mimetype, charset;
        if (!attr || !str::Parse(attr->val, attr->valLen, "%S;%_charset=%S", &mimetype, &charset))
            continue;

        static struct {
            const char *name;
            UINT codepage;
        } codepages[] = {
            { "ISO-8859-1", 1252 }, { "Latin1", 1252 }, { "CP1252", 1252 }, { "Windows-1252", 1252 },
            { "ISO-8859-2", 28592 }, { "Latin2", 28592 },
            { "CP1251", 1251 }, { "Windows-1251", 1251 }, { "KOI8-R", 20866 },
            { "shift-jis", 932 }, { "x-euc", 932 }, { "euc-kr", 949 },
            { "Big5", 950 }, { "GB2312", 936 },
            { "UTF-8", CP_UTF8 },
        };
        for (int i = 0; i < dimof(codepages); i++) {
            if (str::EqI(charset, codepages[i].name))
                return codepages[i].codepage;
        }
        break;
    }
    
    return 0;
}