// Inspects the content marks attached to the text object in |Obj| and decides
// how its "ActualText" replacement text (if any) should be treated.
//
// Returns:
//   Pass  - the object has no marks, no mark carries an "ActualText" string,
//           the replacement text is empty, or the object's font cannot encode
//           any character of the replacement text.
//   Done  - the previous text object (m_pPreTextObj) has the same number of
//           marks and the same last-mark parameter dictionary, or the
//           replacement text contains no character accepted by the
//           printable/range test below.
//   Delay - otherwise.
FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pCurObj = Obj.m_pTextObj.Get();
  size_t nMarkCount = pCurObj->m_ContentMarks.CountItems();
  if (nMarkCount == 0)
    return FPDFText_MarkedContent::Pass;

  // Walk every mark. The last "ActualText" string seen wins, and
  // |pLastParam| ends up holding the parameter dictionary of the final mark
  // (which may be null).
  WideString wsActual;
  bool bHasActualText = false;
  const CPDF_Dictionary* pLastParam = nullptr;
  for (size_t idx = 0; idx < nMarkCount; ++idx) {
    const CPDF_ContentMarkItem* pMark = pCurObj->m_ContentMarks.GetItem(idx);
    pLastParam = pMark->GetParam();
    if (pLastParam) {
      const CPDF_String* pActual =
          ToString(pLastParam->GetObjectFor("ActualText"));
      if (pActual) {
        wsActual = pActual->GetUnicodeText();
        bHasActualText = true;
      }
    }
  }
  if (!bHasActualText)
    return FPDFText_MarkedContent::Pass;

  // Same mark count and same trailing parameter dictionary as the previous
  // text object: report Done.
  if (m_pPreTextObj) {
    const CPDF_ContentMarks& prevMarks = m_pPreTextObj->m_ContentMarks;
    if (prevMarks.CountItems() == nMarkCount &&
        prevMarks.GetItem(nMarkCount - 1)->GetParam() == pLastParam) {
      return FPDFText_MarkedContent::Done;
    }
  }

  if (wsActual.IsEmpty())
    return FPDFText_MarkedContent::Pass;

  // At least one replacement character must map to a char code in the font.
  CPDF_Font* pObjFont = pCurObj->GetFont();
  bool bEncodable = false;
  for (size_t pos = 0; pos < wsActual.GetLength() && !bEncodable; ++pos) {
    bEncodable = pObjFont->CharCodeFromUnicode(wsActual[pos]) !=
                 CPDF_Font::kInvalidCharCode;
  }
  if (!bEncodable)
    return FPDFText_MarkedContent::Pass;

  // At least one replacement character must be either in (0x80, 0xFFFD) or,
  // when <= 0x80, accepted by isprint().
  bool bVisible = false;
  for (size_t pos = 0; pos < wsActual.GetLength() && !bVisible; ++pos) {
    wchar_t ch = wsActual[pos];
    if (ch > 0x80)
      bVisible = ch < 0xFFFD;
    else
      bVisible = isprint(ch) != 0;
  }
  return bVisible ? FPDFText_MarkedContent::Delay
                  : FPDFText_MarkedContent::Done;
}
// Appends the "ActualText" replacement text carried by the content marks of
// the text object in |Obj| to the temporary line buffers (m_TempTextBuf /
// m_TempCharList), one PAGECHAR_INFO per emitted character.
//
// The replacement text is taken from the last mark that supplies a non-empty
// "ActualText" value. A later mark whose dictionary has no "ActualText" (for
// example a nested mark that only carries other keys) must not discard text
// found on an earlier mark. PreMarkedContent() selects the text the same way
// (it only replaces its candidate when a mark really has an "ActualText"
// string), and the caller emits nothing else for the object once this
// function returns.
//
// Does nothing when the object has no marks or no replacement text.
void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  size_t nContentMarks = pTextObj->m_ContentMarks.CountItems();
  if (nContentMarks == 0)
    return;

  WideString actText;
  for (size_t n = 0; n < nContentMarks; ++n) {
    const CPDF_ContentMarkItem* item = pTextObj->m_ContentMarks.GetItem(n);
    const CPDF_Dictionary* pDict = item->GetParam();
    if (!pDict)
      continue;

    // Only replace the candidate when this mark actually provides text.
    WideString markText = pDict->GetUnicodeTextFor("ActualText");
    if (!markText.IsEmpty())
      actText = markText;
  }
  if (actText.IsEmpty())
    return;

  CPDF_Font* pFont = pTextObj->GetFont();
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * Obj.m_formMatrix;

  for (size_t k = 0; k < actText.GetLength(); ++k) {
    wchar_t wChar = actText[k];
    // Characters <= 0x80 rejected by isprint() are emitted as a space.
    if (wChar <= 0x80 && !isprint(wChar))
      wChar = 0x20;
    // Characters at or above 0xFFFD are skipped.
    if (wChar >= 0xFFFD)
      continue;

    // Every emitted character shares the text object's position and
    // rectangle; no per-glyph geometry is computed here.
    PAGECHAR_INFO charinfo;
    charinfo.m_Origin = pTextObj->GetPos();
    charinfo.m_Index = m_TextBuf.GetLength();
    charinfo.m_Unicode = wChar;
    charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
    charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_CharBox = pTextObj->GetRect();
    charinfo.m_Matrix = matrix;
    m_TempTextBuf.AppendChar(wChar);
    m_TempCharList.push_back(charinfo);
  }
}
// Feeds the characters of a text page object into the page's text boxes via
// InsertTextBox(). Non-text objects are ignored.
//
// Two layouts are handled:
//  * Baseline angle != 0: every character becomes its own text box, with its
//    box transformed by the object's text matrix.
//  * Otherwise: consecutive characters are grouped into segments; a new
//    segment starts when a character begins left of the previous one or the
//    gap to the previous character exceeds half the space width.
//
// Character positions come from CalcCharPos() in a temporary array. If that
// array cannot be allocated, the rotated path falls back to the font's
// bounding boxes, while the non-rotated path (which has no fallback) returns
// without emitting anything instead of dereferencing a null pointer.
void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
  if (pObject->m_Type != PDFPAGE_TEXT) {
    return;
  }
  CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
  CPDF_Font* pFont = pText->m_TextState.GetFont();
  int count = pText->CountItems();

  // Two floats per item; indexed below as [2 * k] (left) and [2 * k + 1]
  // (right).
  FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
  if (pPosArray) {
    pText->CalcCharPos(pPosArray);
  }

  FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
  FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();

  // Width used as the "one space" unit: the font's space glyph when it has
  // one with non-zero width, otherwise a quarter of the horizontal font size.
  FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
  FX_FLOAT spacew = 0;
  if (space_charcode != -1) {
    spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
  }
  if (spacew == 0) {
    spacew = fontsize_h / 4;
  }

  if (pText->m_TextState.GetBaselineAngle() != 0) {
    int cc = 0;
    CFX_AffineMatrix matrix;
    pText->GetTextMatrix(&matrix);
    for (int i = 0; i < pText->m_nChars; i++) {
      // With exactly one character, the m_pCharCodes field itself is read as
      // the char code rather than as a pointer to an array.
      FX_DWORD charcode = pText->m_nChars == 1
                              ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes
                              : pText->m_pCharCodes[i];
      if (charcode == (FX_DWORD)-1) {
        continue;
      }
      FX_RECT char_box;
      pFont->GetCharBBox(charcode, char_box);
      FX_FLOAT char_left =
          pPosArray ? pPosArray[cc * 2]
                    : char_box.left * pText->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_right =
          pPosArray ? pPosArray[cc * 2 + 1]
                    : char_box.right * pText->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_top =
          char_box.top * pText->m_TextState.GetFontSize() / 1000;
      FX_FLOAT char_bottom =
          char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
      cc++;
      FX_FLOAT char_origx, char_origy;
      matrix.Transform(char_left, 0, char_origx, char_origy);
      matrix.TransformRect(char_left, char_right, char_top, char_bottom);
      CFX_ByteString str;
      pFont->AppendChar(str, charcode);
      InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
                    char_bottom, spacew, fontsize_v, str, pFont);
    }
    if (pPosArray) {
      FX_Free(pPosArray);
    }
    return;
  }

  // Everything below indexes pPosArray unconditionally; bail out if the
  // allocation above failed.
  if (!pPosArray) {
    return;
  }

  // Rescale the positions by (horizontal font size / font size).
  FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
  for (int ii = 0; ii < count * 2; ii++) {
    pPosArray[ii] *= ratio_h;
  }

  FX_FLOAT baseline = pText->m_PosY;
  CTextBaseLine* pBaseLine = NULL;
  FX_FLOAT topy = pText->m_Top;
  FX_FLOAT bottomy = pText->m_Bottom;
  FX_FLOAT leftx = pText->m_Left;
  int cc = 0;
  CFX_ByteString segment;
  // NOTE(review): space_count is only ever assigned 0 in this function, so
  // the two space_count branches below never execute as written.
  int space_count = 0;
  FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
  for (int i = 0; i < pText->m_nChars; i++) {
    FX_DWORD charcode = pText->m_nChars == 1
                            ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes
                            : pText->m_pCharCodes[i];
    if (charcode == (FX_DWORD)-1) {
      continue;
    }
    FX_FLOAT char_left = pPosArray[cc * 2];
    FX_FLOAT char_right = pPosArray[cc * 2 + 1];
    cc++;

    // Backward jump or a gap wider than half a space: flush the current
    // segment and start a new one at this character.
    if (char_left < last_left || (char_left - last_right) > spacew / 2) {
      pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                                leftx + segment_right, topy, bottomy, spacew,
                                fontsize_v, segment, pFont);
      segment_left = char_left;
      segment = "";
    }
    if (space_count > 1) {
      pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                                leftx + segment_right, topy, bottomy, spacew,
                                fontsize_v, segment, pFont);
      segment = "";
    } else if (space_count == 1) {
      pFont->AppendChar(segment, ' ');
    }
    if (segment.GetLength() == 0) {
      segment_left = char_left;
    }
    segment_right = char_right;
    pFont->AppendChar(segment, charcode);
    space_count = 0;
    last_left = char_left;
    last_right = char_right;
  }

  // Flush whatever is left in the final segment.
  if (segment.GetLength())
    pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
                              leftx + segment_right, topy, bottomy, spacew,
                              fontsize_v, segment, pFont);
  FX_Free(pPosArray);
}
// Converts one text object into character records appended to the temporary
// line buffers (m_TempTextBuf and m_TempCharList).
//
// Steps, in order:
//  1. Objects whose rectangle width is below kSizeEpsilon are ignored.
//  2. PreMarkedContent() is consulted; on Done only the "previous object"
//     state is updated.
//  3. If there is a previous text object, ProcessInsertObject() decides which
//     separator (none / space / line break / hyphen handling) goes between
//     the previous object and this one.
//  4. On Delay, ProcessMarkedContent() emits the characters and the function
//     returns.
//  5. Otherwise each item of the text object is turned into PAGECHAR_INFO
//     entries, with generated spaces inserted where the accumulated spacing
//     reaches a threshold, and near-duplicate glyphs dropped.
void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  // Skip objects with a (near-)zero-width rectangle.
  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
    return;
  CFX_Matrix formMatrix = Obj.m_formMatrix;
  CPDF_Font* pFont = pTextObj->GetFont();
  // Text matrix combined with the enclosing form matrix.
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * formMatrix;
  FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
  if (ePreMKC == FPDFText_MarkedContent::Done) {
    // Nothing is emitted for this object; just remember it as the previous
    // object.
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  GenerateCharacter result = GenerateCharacter::None;
  if (m_pPreTextObj) {
    result = ProcessInsertObject(pTextObj, formMatrix);
    // A line break restarts the current-line rectangle; anything else grows
    // it.
    if (result == GenerateCharacter::LineBreak)
      m_CurlineRect = Obj.m_pTextObj->GetRect();
    else
      m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
    switch (result) {
      case GenerateCharacter::None:
        break;
      case GenerateCharacter::Space: {
        // Insert a generated space, if GenerateCharInfo() produces one.
        Optional<PAGECHAR_INFO> pGenerateChar = GenerateCharInfo(TEXT_SPACE_CHAR);
        if (pGenerateChar) {
          if (!formMatrix.IsIdentity())
            pGenerateChar->m_Matrix = formMatrix;
          m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
          m_TempCharList.push_back(*pGenerateChar);
        }
        break;
      }
      case GenerateCharacter::LineBreak:
        // Close the temporary line, then add CR + LF unless the main text
        // buffer is still empty.
        CloseTempLine();
        if (m_TextBuf.GetSize()) {
          AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
          AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
        }
        break;
      case GenerateCharacter::Hyphen:
        // If this object consists of a single character that is itself a
        // hyphen code, return immediately. Note that this path does not
        // update m_pPreTextObj / m_perMatrix.
        if (pTextObj->CountChars() == 1) {
          CPDF_TextObjectItem item;
          pTextObj->GetCharInfo(0, &item);
          WideString wstrItem = pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
          // No Unicode mapping: fall back to the raw char code.
          if (wstrItem.IsEmpty())
            wstrItem += (wchar_t)item.m_CharCode;
          wchar_t curChar = wstrItem[0];
          if (IsHyphenCode(curChar))
            return;
        }
        // Strip trailing spaces (0x20) from the temporary buffers.
        while (m_TempTextBuf.GetSize() > 0 &&
               m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] == 0x20) {
          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
          m_TempCharList.pop_back();
        }
        // Re-tag the last remaining character as a hyphen: its Unicode
        // becomes 0x2 and its slot in the text buffer is replaced by 0xfffe.
        // NOTE(review): back() and the Delete() below assume the temporary
        // buffers are still non-empty after the stripping loop - presumably
        // guaranteed by ProcessInsertObject() returning Hyphen; confirm.
        PAGECHAR_INFO* charinfo = &m_TempCharList.back();
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        charinfo->m_Unicode = 0x2;
        charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
        m_TempTextBuf.AppendChar(0xfffe);
        break;
    }
  } else {
    // First text object seen: it defines the current-line rectangle.
    m_CurlineRect = Obj.m_pTextObj->GetRect();
  }
  if (ePreMKC == FPDFText_MarkedContent::Delay) {
    // The marked-content replacement text stands in for this object's
    // glyphs.
    ProcessMarkedContent(Obj);
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  m_pPreTextObj = pTextObj;
  m_perMatrix = formMatrix;
  float baseSpace = CalculateBaseSpace(pTextObj, matrix);
  const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
  // Right-to-left text drawn with a matrix of negative determinant.
  const bool bIsBidiAndMirrorInverse = bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
  // Remember where this object's output starts so it can be swapped at the
  // end if needed.
  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
  int32_t iCharListStartAppend = pdfium::CollectionSize<int32_t>(m_TempCharList);
  float spacing = 0;
  const size_t nItems = pTextObj->CountItems();
  for (size_t i = 0; i < nItems; ++i) {
    CPDF_TextObjectItem item;
    PAGECHAR_INFO charinfo;
    pTextObj->GetItemInfo(i, &item);
    if (item.m_CharCode == static_cast<uint32_t>(-1)) {
      // Item without a char code (presumably a positioning adjustment -
      // confirm): it only contributes to |spacing|, and only when the text
      // accumulated so far is non-empty and does not already end in a space.
      WideString str = m_TempTextBuf.MakeString();
      if (str.IsEmpty())
        str = m_TextBuf.AsStringView();
      if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR)
        continue;
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      spacing = -fontsize_h * item.m_Origin.x / 1000;
      continue;
    }
    // Fold the character spacing (when its magnitude exceeds 0.001) into
    // |spacing|, then subtract the base spacing.
    float charSpace = pTextObj->m_TextState.GetCharSpace();
    if (charSpace > 0.001)
      spacing += matrix.TransformDistance(charSpace);
    else if (charSpace < -0.001)
      spacing -= matrix.TransformDistance(fabs(charSpace));
    spacing -= baseSpace;
    if (spacing && i > 0) {
      // Threshold for inserting a generated space: half the width of the
      // font's space glyph, unless the font has no space glyph or that width
      // exceeds a third of the horizontal font size.
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
      float threshold = 0;
      if (space_charcode != CPDF_Font::kInvalidCharCode)
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
      if (threshold > fontsize_h / 3)
        threshold = 0;
      else
        threshold /= 2;
      if (threshold == 0) {
        // Fallback: derive the threshold from the current glyph's width.
        threshold = static_cast<float>(GetCharWidth(item.m_CharCode, pFont));
        threshold = NormalizeThreshold(threshold, 300, 500, 700);
        threshold = fontsize_h * threshold / 1000;
      }
      if (threshold && (spacing && spacing >= threshold)) {
        // Emit a generated space with a zero-area box at this item's
        // transformed origin. |charinfo| is reused for the real glyph below.
        charinfo.m_Unicode = TEXT_SPACE_CHAR;
        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
        charinfo.m_pTextObj = pTextObj;
        charinfo.m_Index = m_TextBuf.GetLength();
        m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
        charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
        charinfo.m_Matrix = formMatrix;
        charinfo.m_Origin = matrix.Transform(item.m_Origin);
        charinfo.m_CharBox = CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y, charinfo.m_Origin.x, charinfo.m_Origin.y);
        m_TempCharList.push_back(charinfo);
      }
      // NOTE(review): if kInvalidCharCode equals static_cast<uint32_t>(-1),
      // this case was already handled at the top of the loop - confirm.
      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
        continue;
    }
    spacing = 0;
    WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
    bool bNoUnicode = false;
    // No Unicode mapping but a non-zero char code: use the raw code as the
    // character and flag the entry accordingly.
    if (wstrItem.IsEmpty() && item.m_CharCode) {
      wstrItem += static_cast<wchar_t>(item.m_CharCode);
      bNoUnicode = true;
    }
    charinfo.m_Index = -1;
    charinfo.m_CharCode = item.m_CharCode;
    charinfo.m_Flag = bNoUnicode ? FPDFTEXT_CHAR_UNUNICODE : FPDFTEXT_CHAR_NORMAL;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_Origin = matrix.Transform(item.m_Origin);
    // Character box: the font's glyph bounding box scaled by font size /
    // 1000 and offset by the item origin, before transformation.
    const FX_RECT rect = charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
    const float fFontSize = pTextObj->GetFontSize() / 1000;
    charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
    charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
    // Degenerate height: use the font size instead.
    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < kSizeEpsilon) {
      charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
    }
    // Degenerate width: use the glyph's width instead.
    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < kSizeEpsilon) {
      charinfo.m_CharBox.right = charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
    }
    charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
    charinfo.m_Matrix = matrix;
    if (wstrItem.IsEmpty()) {
      // Reached only when there is no mapping and the char code is 0: record
      // the entry with Unicode 0 and a 0xfffe placeholder in the buffer.
      charinfo.m_Unicode = 0;
      m_TempCharList.push_back(charinfo);
      m_TempTextBuf.AppendChar(0xfffe);
      continue;
    }
    int nTotal = wstrItem.GetLength();
    bool bDel = false;
    // Duplicate detection: compare against up to the last 7 entries of the
    // temporary list. An entry with the same char code, the same font and an
    // origin within |threshold| on both axes marks this glyph as a duplicate.
    const int count = std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
    float threshold = charinfo.m_Matrix.TransformXDistance(
        static_cast<float>(TEXT_CHARRATIO_GAPDELTA) * pTextObj->GetFontSize());
    for (int n = pdfium::CollectionSize<int>(m_TempCharList); n > pdfium::CollectionSize<int>(m_TempCharList) - count; --n) {
      const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
      CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
      if (charinfo1.m_CharCode == charinfo.m_CharCode &&
          charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
        bDel = true;
        break;
      }
    }
    if (!bDel) {
      // One entry per Unicode character of the mapping; a zero character is
      // stored as 0xfffe in the buffer and keeps the index assigned earlier.
      for (int nIndex = 0; nIndex < nTotal; ++nIndex) {
        charinfo.m_Unicode = wstrItem[nIndex];
        if (charinfo.m_Unicode) {
          charinfo.m_Index = m_TextBuf.GetLength();
          m_TempTextBuf.AppendChar(charinfo.m_Unicode);
        } else {
          m_TempTextBuf.AppendChar(0xfffe);
        }
        m_TempCharList.push_back(charinfo);
      }
    } else if (i == 0) {
      // The duplicate is the first item of this object: drop a trailing
      // space from the temporary buffers, if there is one.
      WideString str = m_TempTextBuf.MakeString();
      if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) {
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        m_TempCharList.pop_back();
      }
    }
  }
  // Hand the range appended by this object to SwapTempTextBuf() when the
  // text is right-to-left and mirrored.
  if (bIsBidiAndMirrorInverse)
    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
}
// Appends the text of |pObj| to m_Buffer. When m_pObjArray is set, two
// entries are added to it per buffered character: (pObj, item index) for
// real characters and (NULL, NULL) for generated separators.
//
// The separator placed before the object depends on the code returned by
// FPDFText_ProcessInterObj(m_pLastObj, pObj):
//    2 - if line feeds are enabled and the buffer ends with '-', that hyphen
//        is removed; otherwise returns TRUE when |bFirstLine| is set, else
//        appends "\r\n" (line feeds enabled) or a single space.
//    1 - appends a space.
//   -1 - records |pObj| as the last object and returns FALSE.
//    3 - starts at item 1 instead of item 0.
//
// Returns TRUE only in the |bFirstLine| case described above, FALSE
// otherwise.
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine) {
  CPDF_Font* pFont = pObj->GetFont();
  CFX_AffineMatrix matrix;
  pObj->GetTextMatrix(&matrix);

  int item_index = 0;
  if (m_pLastObj) {
    switch (FPDFText_ProcessInterObj(m_pLastObj, pObj)) {
      case 2: {
        int nBufLen = m_Buffer.GetLength();
        if (nBufLen && m_bUseLF && m_Buffer.GetBuffer()[nBufLen - 1] == L'-') {
          // Drop the trailing hyphen and its two bookkeeping entries.
          m_Buffer.Delete(nBufLen - 1, 1);
          if (m_pObjArray) {
            m_pObjArray->RemoveAt((nBufLen - 1) * 2, 2);
          }
          break;
        }
        if (bFirstLine) {
          return TRUE;
        }
        if (m_bUseLF) {
          m_Buffer.AppendChar(L'\r');
          m_Buffer.AppendChar(L'\n');
          if (m_pObjArray) {
            // Two characters, two entries each.
            for (int nPad = 0; nPad < 4; nPad++) {
              m_pObjArray->Add(NULL);
            }
          }
        } else {
          m_Buffer.AppendChar(' ');
          if (m_pObjArray) {
            m_pObjArray->Add(NULL);
            m_pObjArray->Add(NULL);
          }
        }
        break;
      }
      case 1:
        m_Buffer.AppendChar(L' ');
        if (m_pObjArray) {
          m_pObjArray->Add(NULL);
          m_pObjArray->Add(NULL);
        }
        break;
      case -1:
        m_pLastObj = pObj;
        return FALSE;
      case 3:
        item_index = 1;
        break;
      default:
        break;
    }
  }
  m_pLastObj = pObj;

  int nItems = pObj->CountItems();

  // Scan the odd-indexed items (excluding the last item). If all of them lack
  // a char code, Ignorekerning becomes the smallest m_OriginX among them;
  // as soon as one has a char code it is reset to 0.
  FX_FLOAT Ignorekerning = 0;
  for (int iOdd = 1; iOdd < nItems - 1; iOdd += 2) {
    CPDF_TextObjectItem oddItem;
    pObj->GetItemInfo(iOdd, &oddItem);
    if (oddItem.m_CharCode != (FX_DWORD)-1) {
      Ignorekerning = 0;
      break;
    }
    if (iOdd == 1 || Ignorekerning > oddItem.m_OriginX) {
      Ignorekerning = oddItem.m_OriginX;
    }
  }

  FX_FLOAT spacing = 0;
  for (; item_index < nItems; item_index++) {
    CPDF_TextObjectItem item;
    pObj->GetItemInfo(item_index, &item);

    if (item.m_CharCode == (FX_DWORD)-1) {
      // Item without a char code: it only sets |spacing|, and only when the
      // buffer is non-empty and does not already end with a space.
      CFX_WideString wsSoFar = m_Buffer.GetWideString();
      if (!wsSoFar.IsEmpty() && wsSoFar.GetAt(wsSoFar.GetLength() - 1) != L' ') {
        FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
        spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
      }
      continue;
    }

    FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
    if (nItems > 3 && !spacing) {
      charSpace = 0;
    }

    if ((spacing || charSpace) && item_index > 0) {
      int last_width = 0;
      FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
      FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');

      // Threshold: half the space glyph's width, unless the font has no
      // space glyph or that width exceeds a third of the horizontal font
      // size.
      FX_FLOAT threshold = 0;
      if (space_charcode != (FX_DWORD)-1) {
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
      }
      if (threshold > fontsize_h / 3) {
        threshold = 0;
      } else {
        threshold /= 2;
      }
      if (threshold == 0) {
        // Fallback: a fraction of the current glyph's width, the divisor
        // depending on how wide the glyph is.
        int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
        threshold = (FX_FLOAT)(this_width > last_width ? this_width : last_width);
        int nDivide = threshold < 300 ? 2
                      : threshold < 500 ? 4
                      : threshold < 700 ? 5
                                        : 6;
        threshold /= nDivide;
        threshold = fontsize_h * threshold / 1000;
      }

      if (charSpace > 0.001) {
        spacing += matrix.TransformDistance(charSpace);
      } else if (charSpace < -0.001) {
        spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
      }

      if (threshold && spacing && spacing >= threshold) {
        m_Buffer.AppendChar(L' ');
        if (m_pObjArray) {
          m_pObjArray->Add(NULL);
          m_pObjArray->Add(NULL);
        }
      }
      if (item.m_CharCode == (FX_DWORD)-1) {
        continue;
      }
      spacing = 0;
    }

    // Append the glyph's Unicode mapping, or the raw char code when the font
    // has no mapping, then one (object, item index) pair per appended
    // character.
    CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
    int nAppended = 1;
    if (unicode_str.IsEmpty()) {
      m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
    } else {
      m_Buffer << unicode_str;
      nAppended = unicode_str.GetLength();
    }
    if (m_pObjArray) {
      for (int k = 0; k < nAppended; k++) {
        m_pObjArray->Add((void*)pObj);
        m_pObjArray->Add((void*)(FX_INTPTR)item_index);
      }
    }
  }
  return FALSE;
}