Esempio n. 1
0
// Unit test for HTML entity decoding. Covers three things:
//  - ResolveHtmlEntity() on single entity references (valid and invalid),
//  - ResolveHtmlEntities() on strings without '&' (input returned as-is),
//  - ResolveHtmlEntities() on strings with '&' (a new string is returned,
//    which this test releases with free()).
static void HtmlEntities()
{
    // Every input is a complete entity reference. The loop below skips the
    // leading '&' (s + 1), so the inputs must be spelled as references and
    // not as the already-decoded characters. A rune of -1 marks references
    // that are expected to be rejected.
    struct {
        const char *s; int rune;
    } entities[] = {
        { "&Uuml;", 220 },
        { "&uuml;", 252 },
        { "&times;", 215 },
        { "&AElig;", 198 },
        { "&zwnj;", 8204 },
        { "&#58;", 58 },
        { "&#32783;", 32783 },
        { "&#x20;", 32 },
        { "&#xAF34;", 44852 },
        { "&Auml;", 196 },
        { "&a3;", -1 },
        { "&#xz312;", -1 },
        { "&aer;", -1 }
    };
    for (size_t i = 0; i < dimof(entities); i++ ) {
        const char *s = entities[i].s;
        int got;
        // pass the text after the '&'; got receives the decoded code point
        const char *entEnd = ResolveHtmlEntity(s + 1, str::Len(s) - 1, got);
        assert(got == entities[i].rune);
        // a NULL end pointer must be reported exactly when got is -1
        assert((-1 == got) == !entEnd);
    }

    // strings without any '&' must come back as the very same pointer
    const char *unchanged[] = {
        "foo", "", " as;d "
    };
    for (size_t i = 0; i < dimof(unchanged); i++) {
        const char *s = unchanged[i];
        const char *res = ResolveHtmlEntities(s, s + str::Len(s), NULL);
        assert(res == s);
    }

    struct {
        const char *s; const char *res;
    } changed[] = {
        // implementation detail: if there is '&' in the string
        // we always allocate, even if it isn't a valid entity
        { "a&12", "a&12" },
        { "a&x#30", "a&x#30" },

        { "&#32;b", " b" },
        { "&#x20;ra", " ra" },
        { "&lt;", "<" },
        { "a&amp; &#32;to&#x20;end", "a&  to end" },
        // references without a terminating ';' are expected to resolve too;
        // non-ASCII results are compared as UTF-8 bytes
        { "&nbsp test&auml ;&ouml;&#64&#x50go", "\xC2\xA0 test\xC3\xA4 ;\xC3\xB6@Pgo" },
    };
    for (size_t i = 0; i < dimof(changed); i++) {
        const char *s = changed[i].s;
        const char *res = ResolveHtmlEntities(s, s + str::Len(s), NULL);
        assert(str::Eq(res, changed[i].res));
        // with a NULL allocator the result is released with free()
        free((void*)res);
    }
}
Esempio n. 2
0
// Convenience wrapper around the (start, end, allocator) overload of
// ResolveHtmlEntities that never hands the caller's own buffer back:
// when that overload returns s itself, a copy of the first len
// characters is returned instead.
char *ResolveHtmlEntities(const char *s, size_t len)
{
    const char *resolved = ResolveHtmlEntities(s, s + len, NULL);
    if (resolved != s)
        return (char *)resolved;
    // input came back untouched, so duplicate it for the caller
    return str::DupN(s, len);
}
Esempio n. 3
0
// a text run is a string of consecutive text with uniform style
//
// Emits draw instructions for the text in [s, end): HTML entities are
// resolved first, then the remaining text is measured repeatedly and a
// string instruction is appended for each piece, splitting the run into
// several pieces when it is wider than the available space.
void HtmlFormatter::EmitTextRun(const char* s, const char* end) {
    // remember where this run starts, as an offset from the parser's start
    currReparseIdx = s - htmlParser->Start();
    CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser));
    // whitespace-only runs are only expected in preformatted mode
    CrashIf(IsSpaceOnly(s, end) && !preFormatted);
    // a result different from s means the text was rewritten into a new
    // string (presumably owned by textAllocator - confirm); from here on
    // s/end refer to that string instead of the original source
    const char* tmp = ResolveHtmlEntities(s, end, textAllocator);
    bool resolved = tmp != s;
    if (resolved) {
        s = tmp;
        end = s + str::Len(s);
    }

    while (s < end) {
        // don't update the reparseIdx if s doesn't point into the original source
        if (!resolved)
            currReparseIdx = s - htmlParser->Start();

        // convert the remaining UTF-8 text into WCHARs in buf
        // (capacity dimof(buf)) for measuring
        size_t strLen = str::Utf8ToWcharBuf(s, end - s, buf, dimof(buf));
        // soft hyphens should not be displayed
        // (assumes RemoveChars returns the number of removed characters)
        strLen -= str::RemoveChars(buf, L"\xad");
        if (0 == strLen)
            break;
        textMeasure->SetFont(CurrFont());
        RectF bbox = textMeasure->Measure(buf, strLen);
        EnsureDx(bbox.Width);
        // everything that is left fits: emit it as one instruction and stop
        if (bbox.Width <= pageDx - currX) {
            AppendInstr(DrawInstr::Str(s, end - s, bbox, dirRtl));
            currX += bbox.Width;
            break;
        }

        // otherwise find how many WCHARs fit into the width pageDx - NewLineX()
        size_t lenThatFits = StringLenForWidth(textMeasure, buf, strLen, pageDx - NewLineX());
        // try to prevent a break in the middle of a word
        // (scan backwards for the last non-alphanumeric character and cut
        // right after it; if there is none, the mid-word cut is kept)
        if (iswalnum(buf[lenThatFits])) {
            for (size_t len = lenThatFits; len > 0; len--) {
                if (!iswalnum(buf[len - 1])) {
                    lenThatFits = len;
                    break;
                }
            }
        }
        textMeasure->SetFont(CurrFont());
        bbox = textMeasure->Measure(buf, lenThatFits);
        CrashIf(bbox.Width > pageDx);
        // s is UTF-8 and buf is UTF-16, so one
        // WCHAR doesn't always equal one char
        // TODO: this usually fails for non-BMP characters (i.e. hardly ever)
        // (turn the WCHAR count into a UTF-8 byte count: add one extra byte
        // per WCHAR in [0x80, 0x800) and two per WCHAR >= 0x800; i starts
        // from the WCHAR count before any bytes are added)
        for (size_t i = lenThatFits; i > 0; i--) {
            lenThatFits += buf[i - 1] < 0x80 ? 0 : buf[i - 1] < 0x800 ? 1 : 2;
        }
        AppendInstr(DrawInstr::Str(s, lenThatFits, bbox, dirRtl));
        currX += bbox.Width;
        // NOTE(review): if lenThatFits were 0 here, s would not advance and
        // the loop would repeat with the same text - presumably
        // StringLenForWidth always returns at least 1; confirm
        s += lenThatFits;
    }
}
Esempio n. 4
0
// Builds a plain-text version of the document's HTML data.
// Text tokens are trimmed, entity-resolved and joined with single spaces;
// text inside <head>, <script> and <style> is skipped; closing a
// non-inline tag replaces the trailing space with "\r\n".
// Returns the collected UTF-8 text converted via str::conv::FromUtf8.
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t dataLen;
    const char *html = doc->GetTextData(&dataLen);

    str::Str<char> plain(dataLen / 2);
    HtmlPullParser parser(html, dataLen);
    // stack of the currently open (not self-closing) tags
    Vec<HtmlTag> openTags;

    for (HtmlToken *tok = parser.Next(); tok != NULL && !tok->IsError(); tok = parser.Next()) {
        if (tok->IsText() && !(openTags.Contains(Tag_Head) || openTags.Contains(Tag_Script) || openTags.Contains(Tag_Style))) {
            // trim whitespace (TODO: also normalize within text?)
            for (; tok->sLen > 0 && str::IsWs(tok->s[0]); tok->sLen--)
                tok->s++;
            for (; tok->sLen > 0 && str::IsWs(tok->s[tok->sLen - 1]); tok->sLen--)
                ;
            if (tok->sLen > 0) {
                text_append:
                plain.AppendAndFree(ResolveHtmlEntities(tok->s, tok->sLen));
                plain.Append(' ');
            }
            continue;
        }

        if (tok->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(tok->tag))
                openTags.Append(tok->tag);
            continue;
        }

        if (!tok->IsEndTag())
            continue;

        // the end of a block-level element ends the current line
        if (!IsInlineTag(tok->tag) && plain.Size() > 0 && plain.Last() == ' ') {
            plain.Pop();
            plain.Append("\r\n");
        }
        // when closing a tag, if the top tag doesn't match but
        // there are only potentially self-closing tags on the
        // stack between the matching tag, we pop all of them
        // and finally the matching tag itself
        if (openTags.Contains(tok->tag)) {
            while (openTags.Last() != tok->tag)
                openTags.Pop();
            openTags.Pop();
        }
    }

    return str::conv::FromUtf8(plain.Get());
}
Esempio n. 5
0
void EbookController::OnClickedLink(int pageNo, DrawInstr *link)
{
    ScopedMem<WCHAR> url(str::conv::FromHtmlUtf8(link->str.s, link->str.len));
    if (url::IsAbsolute(url)) {
        EbookTocDest dest(nullptr, url);
        cb->GotoLink(&dest);
        return;
    }

    if (Doc_Epub == doc.Type() && pages && (size_t)pageNo <= pages->Count()) {
        // normalize the URL by combining it with the chapter's base path
        for (int j = pageNo; j > 0; j--) {
            HtmlPage *p = pages->At(j - 1);
            // <pagebreak src="..." page_marker /> is usually the second instruction on a page
            for (size_t k = 0; k < std::min((size_t)2, p->instructions.Count()); k++) {
                DrawInstr& di = p->instructions.At(k);
                if (InstrAnchor == di.type && str::StartsWith(di.str.s + di.str.len, "\" page_marker />")) {
                    ScopedMem<char> basePath(str::DupN(di.str.s, di.str.len));
                    ScopedMem<char> relPath(ResolveHtmlEntities(link->str.s, link->str.len));
                    ScopedMem<char> absPath(NormalizeURL(relPath, basePath));
                    url.Set(str::conv::FromUtf8(absPath));
                    j = 0; // done
                    break;
                }
            }
        }
    }

    int idx = ResolvePageAnchor(url);
    if (-1 == idx && str::FindChar(url, '%')) {
        url::DecodeInPlace(url);
        idx = ResolvePageAnchor(url);
    }
    if (idx != -1) {
        EbookTocDest dest(nullptr, idx);
        cb->GotoLink(&dest);
    }
}