// Sanity checks for the IsInlineTag() and IsTagSelfClosing() tag predicates.
static void Test01() {
    // Tag_A, Tag_U and Tag_Span must be reported as inline, Tag_P must not
    utassert(IsInlineTag(Tag_A));
    utassert(IsInlineTag(Tag_U));
    utassert(IsInlineTag(Tag_Span));
    utassert(!IsInlineTag(Tag_P));
    // Tag_Area, Tag_Link and Tag_Param must be reported as self-closing,
    // Tag_P must not
    utassert(IsTagSelfClosing(Tag_Area));
    utassert(IsTagSelfClosing(Tag_Link));
    utassert(IsTagSelfClosing(Tag_Param));
    utassert(!IsTagSelfClosing(Tag_P));
}
Beispiel #2
0
// Keeps the tagNesting stack in step with the tag token t.
//
// Nothing is recorded when keepTagNesting is set, when the tag is
// Tag_NotFound, when t->IsEmptyElementEndTag() holds or when the tag is
// self-closing. Otherwise:
// - inline start tag: pushed as-is
// - other start tag: the entries on top of the stack for which
//   AutoCloseOnOpen() holds are auto-closed first, then the tag is pushed
// - end tag: searches downwards for the matching open tag (an inline end
//   tag gives up at the first non-inline entry); when a match is found,
//   everything above it is auto-closed and the match is popped, otherwise
//   the stack is left untouched
void HtmlFormatter::UpdateTagNesting(HtmlToken* t) {
    CrashIf(!t->IsTag());
    if (keepTagNesting) {
        return;
    }
    if (Tag_NotFound == t->tag || t->IsEmptyElementEndTag() || IsTagSelfClosing(t->tag)) {
        return;
    }

    // number of entries (counted from the bottom of the stack) that stay open
    size_t nKeep = tagNesting.size();
    const bool inlineTag = IsInlineTag(t->tag);
    if (t->IsStartTag()) {
        if (inlineTag) {
            tagNesting.Push(t->tag);
            return;
        }
        // skip over every open tag that can't contain this new block-level tag
        while (nKeep > 0 && AutoCloseOnOpen(t->tag, tagNesting.at(nKeep - 1))) {
            nKeep--;
        }
    } else {
        // walk down towards the matching open tag
        while (nKeep > 0) {
            // an inline end tag never reaches past a block-level entry
            if (inlineTag && !IsInlineTag(tagNesting.at(nKeep - 1))) {
                break;
            }
            if (t->tag == tagNesting.at(nKeep - 1)) {
                break;
            }
            nKeep--;
        }
        // no matching open tag: nothing to close
        if (0 == nKeep || tagNesting.at(nKeep - 1) != t->tag) {
            return;
        }
    }

    AutoCloseTags(tagNesting.size() - nKeep);

    if (t->IsStartTag()) {
        tagNesting.Push(t->tag);
        return;
    }
    CrashIf(!t->IsEndTag() || t->tag != tagNesting.Last());
    tagNesting.Pop();
}
// Builds the element tree from s, which is tokenized and modified in place:
// tag names, attribute names and attribute values get 0-terminated inside
// the buffer. s must be 0-terminated and stays owned by the caller.
// Returns rootElement, or the result of ParseError() as soon as the
// tokenizer reports an error token.
HtmlElement *HtmlParser::ParseInPlace(char *s, UINT codepage)
{
    if (this->html)
        Reset();
    this->codepage = codepage;
    this->html = s;

    HtmlPullParser parser(s, strlen(s));

    for (HtmlToken *tok = parser.Next(); tok; tok = parser.Next()) {
        char *tag = (char *)tok->s;
        if (tok->IsError()) {
            errorContext = tag;
            if (HtmlToken::UnclosedTag == tok->error)
                return ParseError(ErrParsingElementName);
            if (HtmlToken::InvalidTag == tok->error)
                return ParseError(ErrParsingClosingElement);
            return ParseError(ErrParsingElement);
        }
        if (!tok->IsTag()) {
            // text content is not part of the tree
            assert(tok->IsText());
            continue;
        }

        char *tagEnd = tag + tok->nLen;
        if (!tok->IsEndTag()) {
            // the next attribute is always fetched before anything
            // belonging to the current one gets 0-terminated
            AttrInfo *attr = tok->NextAttr();
            *tagEnd = '\0';
            StartTag(tag);

            while (attr) {
                // read all fields of the current attribute first ...
                char *attrName = (char *)attr->name;
                char *attrValue = (char *)attr->val;
                char *nameStop = attrName + attr->nameLen;
                char *valueStop = attrValue + attr->valLen;
                // ... then advance, and only afterwards write into the buffer
                attr = tok->NextAttr();

                *valueStop = '\0';
                *nameStop = '\0';
                AppendAttr(attrName, attrValue);
            }
        }

        // a start tag that isn't self-closing stays open
        if (tok->IsStartTag() && !IsTagSelfClosing(tok->tag))
            continue;
        *tagEnd = '\0';
        CloseTag(tag);
    }

    return rootElement;
}
Beispiel #4
0
// Collects the text content of doc's HTML data and returns it converted
// with str::conv::FromUtf8().
// Text tokens are skipped while Tag_Head, Tag_Script or Tag_Style is on the
// nesting stack. Every other text run is whitespace-trimmed, has its HTML
// entities resolved and is followed by a single space; the end tag of a
// non-inline tag turns a trailing space into "\r\n".
// Tokenizing stops at the end of the data or at the first error token.
static WCHAR *ExtractHtmlText(EpubDoc *doc)
{
    size_t len;
    const char *data = doc->GetTextData(&len);

    str::Str<char> text(len / 2);
    HtmlPullParser p(data, len);
    Vec<HtmlTag> tagNesting;

    for (HtmlToken *t = p.Next(); t != NULL && !t->IsError(); t = p.Next()) {
        if (t->IsText() && !(tagNesting.Contains(Tag_Head) || tagNesting.Contains(Tag_Script) || tagNesting.Contains(Tag_Style))) {
            // strip leading whitespace (TODO: also normalize within text?)
            while (t->sLen > 0 && str::IsWs(*t->s)) {
                ++t->s;
                --t->sLen;
            }
            // strip trailing whitespace
            while (t->sLen > 0 && str::IsWs(t->s[t->sLen - 1])) {
                --t->sLen;
            }
            if (t->sLen > 0) {
                text.AppendAndFree(ResolveHtmlEntities(t->s, t->sLen));
                text.Append(' ');
            }
            continue;
        }

        if (t->IsStartTag()) {
            // TODO: force-close tags similar to HtmlFormatter.cpp's AutoCloseOnOpen?
            if (!IsTagSelfClosing(t->tag)) {
                tagNesting.Append(t->tag);
            }
            continue;
        }

        if (!t->IsEndTag()) {
            continue;
        }

        // the end of a non-inline tag ends the current line
        const bool endsLine = !IsInlineTag(t->tag);
        if (endsLine && text.Size() > 0 && text.Last() == ' ') {
            text.Pop();
            text.Append("\r\n");
        }
        // if the closed tag is open anywhere on the stack, drop every entry
        // above it and then the tag itself; an end tag without a matching
        // open tag leaves the stack as it is
        if (tagNesting.Contains(t->tag)) {
            while (tagNesting.Last() != t->tag) {
                tagNesting.Pop();
            }
            tagNesting.Pop();
        }
    }

    return str::conv::FromUtf8(text.Get());
}
Beispiel #5
0
// Pushes tag onto tagNesting so that the stack mirrors the currently open
// elements of the html tree; self-closing tags are never recorded.
static void RecordStartTag(Vec<HtmlTag>* tagNesting, HtmlTag tag)
{
    if (IsTagSelfClosing(tag)) {
        return;
    }
    tagNesting->Append(tag);
}