// a text run is a string of consecutive text with uniform style void HtmlFormatter::EmitTextRun(const char* s, const char* end) { currReparseIdx = s - htmlParser->Start(); CrashIf(!ValidReparseIdx(currReparseIdx, htmlParser)); CrashIf(IsSpaceOnly(s, end) && !preFormatted); const char* tmp = ResolveHtmlEntities(s, end, textAllocator); bool resolved = tmp != s; if (resolved) { s = tmp; end = s + str::Len(s); } while (s < end) { // don't update the reparseIdx if s doesn't point into the original source if (!resolved) currReparseIdx = s - htmlParser->Start(); size_t strLen = str::Utf8ToWcharBuf(s, end - s, buf, dimof(buf)); // soft hyphens should not be displayed strLen -= str::RemoveChars(buf, L"\xad"); if (0 == strLen) break; textMeasure->SetFont(CurrFont()); RectF bbox = textMeasure->Measure(buf, strLen); EnsureDx(bbox.Width); if (bbox.Width <= pageDx - currX) { AppendInstr(DrawInstr::Str(s, end - s, bbox, dirRtl)); currX += bbox.Width; break; } size_t lenThatFits = StringLenForWidth(textMeasure, buf, strLen, pageDx - NewLineX()); // try to prevent a break in the middle of a word if (iswalnum(buf[lenThatFits])) { for (size_t len = lenThatFits; len > 0; len--) { if (!iswalnum(buf[len - 1])) { lenThatFits = len; break; } } } textMeasure->SetFont(CurrFont()); bbox = textMeasure->Measure(buf, lenThatFits); CrashIf(bbox.Width > pageDx); // s is UTF-8 and buf is UTF-16, so one // WCHAR doesn't always equal one char // TODO: this usually fails for non-BMP characters (i.e. hardly ever) for (size_t i = lenThatFits; i > 0; i--) { lenThatFits += buf[i - 1] < 0x80 ? 0 : buf[i - 1] < 0x800 ? 1 : 2; } AppendInstr(DrawInstr::Str(s, lenThatFits, bbox, dirRtl)); currX += bbox.Width; s += lenThatFits; } }
// Returns next part of html or NULL if finished HtmlToken *HtmlPullParser::Next() { if (currPos >= end) return NULL; Next: const char *start = currPos; if (*currPos != '<' || currPos + 1 < end && !IsValidTagStart(*++currPos)) { // this must be text between tags if (!SkipUntil(currPos, end, '<') && IsSpaceOnly(start, currPos)) { // ignore whitespace after the last tag return NULL; } currToken.SetText(start, currPos); return &currToken; } // '<' - tag begins ++start; // skip <? and <! (processing instructions and comments) if (start < end && (('?' == *start) || ('!' == *start))) { if ('!' == *start && start + 2 < end && str::StartsWith(start, "!--")) { currPos = start + 3; if (!SkipUntil(currPos, end, "-->")) { currToken.SetError(HtmlToken::UnclosedTag, start); return &currToken; } currPos += 2; } else if (!SkipUntil(currPos, end, '>')) { currToken.SetError(HtmlToken::UnclosedTag, start); return &currToken; } ++currPos; goto Next; } if (!SkipUntilTagEnd(currPos, end)) { currToken.SetError(HtmlToken::UnclosedTag, start); return &currToken; } CrashIf('>' != *currPos); if (currPos == start || currPos == start + 1 && *start == '/') { // skip empty tags (</>), because we're lenient ++currPos; goto Next; } if (('/' == *start) && ('/' == currPos[-1])) { // </foo/> currToken.SetError(HtmlToken::InvalidTag, start); } else if ('/' == *start) { // </foo> currToken.SetTag(HtmlToken::EndTag, start + 1, currPos); } else if ('/' == currPos[-1]) { // <foo/> currToken.SetTag(HtmlToken::EmptyElementTag, start, currPos - 1); } else { currToken.SetTag(HtmlToken::StartTag, start, currPos); } ++currPos; return &currToken; }