// Counts the words contained in |pTextObj|.
//
// Every character code of the object is mapped to Unicode through the
// object's font; only the first code unit of the mapping is examined, and an
// empty mapping is treated as 0. A run of consecutive characters for which
// ISLATINWORD() holds counts as a single word. Any other character counts as
// one word of its own, except the space (0x20), which is never counted.
//
// Returns 0 when |pTextObj| is null or has no font.
int Document::CountWords(CPDF_TextObject* pTextObj) {
  if (!pTextObj)
    return 0;

  CPDF_Font* pFont = pTextObj->GetFont();
  if (!pFont)
    return 0;

  int nWordTotal = 0;
  // TRUE while the previously seen character belonged to a Latin-word run.
  FX_BOOL bInLatinRun = FALSE;
  const int nCharTotal = pTextObj->CountChars();
  for (int nCharIdx = 0; nCharIdx < nCharTotal; ++nCharIdx) {
    FX_DWORD dwCode = -1;
    FX_FLOAT fKerning;  // Filled by GetCharInfo(); not used afterwards.
    pTextObj->GetCharInfo(nCharIdx, dwCode, fKerning);

    CFX_WideString swMapped = pFont->UnicodeFromCharCode(dwCode);
    FX_WORD wch = 0;
    if (swMapped.GetLength() > 0)
      wch = swMapped[0];

    const FX_BOOL bLatin = ISLATINWORD(wch);
    // A Latin-word character directly following another one continues the
    // current word; everything else is a potential word start.
    if (!(bLatin && bInLatinRun)) {
      bInLatinRun = bLatin;
      if (wch != 0x20)
        ++nWordTotal;
    }
  }
  return nWordTotal;
}
// Returns the text of the word at zero-based position |nWordIndex| inside
// |pTextObj|.
//
// Words are delimited by the same rule CountWords() applies: character codes
// are mapped to Unicode through the object's font (first code unit only, an
// empty mapping is treated as 0); a run of consecutive ISLATINWORD()
// characters forms one word, and any other character except the space (0x20)
// is a word of its own. Every character visited while the running word index
// equals |nWordIndex| is appended to the result, which includes spaces that
// directly follow the requested word.
//
// Returns an empty string when |pTextObj| is null, when it has no font, or
// when no character falls under |nWordIndex|.
CFX_WideString Document::GetObjWordStr(CPDF_TextObject* pTextObj,
                                       int nWordIndex) {
  ASSERT(pTextObj != NULL);
  // Where ASSERT is compiled out, a null object would otherwise be
  // dereferenced below. Fail soft, as CountWords() does.
  if (!pTextObj)
    return L"";

  CPDF_Font* pFont = pTextObj->GetFont();
  if (!pFont)
    return L"";

  CFX_WideString swRet;
  int nWords = 0;
  FX_BOOL bIsLatin = FALSE;
  for (int i = 0, sz = pTextObj->CountChars(); i < sz; i++) {
    FX_DWORD charcode = -1;
    FX_FLOAT kerning = 0;  // Out-parameter of GetCharInfo(); value unused.
    pTextObj->GetCharInfo(i, charcode, kerning);

    CFX_WideString swUnicode = pFont->UnicodeFromCharCode(charcode);
    FX_WORD unicode = 0;
    if (swUnicode.GetLength() > 0)
      unicode = swUnicode[0];

    // A Latin-word character that follows another one continues the current
    // word; anything else may start a new word.
    if (!(ISLATINWORD(unicode) && bIsLatin)) {
      bIsLatin = ISLATINWORD(unicode);
      if (unicode != 0x20)
        nWords++;
    }

    // |nWords| never decreases, so once the running index has moved past the
    // requested word nothing further can be appended.
    if (nWords - 1 > nWordIndex)
      break;
    if (nWords - 1 == nWordIndex)
      swRet += unicode;
  }
  return swRet;
}
// Processes a single text object and appends its characters to the temporary
// line state (m_TempTextBuf / m_TempCharList).
//
// The function works in two phases:
//  1. If a previous text object exists, ProcessInsertObject() decides which
//     separator (none, space, line break, hyphen marker) goes between the
//     previous object and this one, and that separator is emitted.
//  2. Every item of the object is converted into one or more PAGECHAR_INFO
//     entries, inserting generated spaces where the accumulated spacing
//     reaches a threshold and dropping characters that duplicate a recently
//     added one.
//
// |Obj| carries the text object (m_pTextObj) and the form matrix
// (m_formMatrix) it was collected with.
void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  // Objects whose rectangle width is below kSizeEpsilon are ignored entirely.
  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
    return;
  CFX_Matrix formMatrix = Obj.m_formMatrix;
  CPDF_Font* pFont = pTextObj->GetFont();
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * formMatrix;
  FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
  // PreMarkedContent() reported Done: only remember this object as the
  // previous one and stop.
  if (ePreMKC == FPDFText_MarkedContent::Done) {
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  GenerateCharacter result = GenerateCharacter::None;
  if (m_pPreTextObj) {
    result = ProcessInsertObject(pTextObj, formMatrix);
    // A line break starts a fresh current-line rectangle; anything else
    // grows the existing one.
    if (result == GenerateCharacter::LineBreak)
      m_CurlineRect = Obj.m_pTextObj->GetRect();
    else
      m_CurlineRect.Union(Obj.m_pTextObj->GetRect());
    switch (result) {
      case GenerateCharacter::None:
        break;
      case GenerateCharacter::Space: {
        // Emit a generated space into the temporary line, if one could be
        // produced.
        Optional<PAGECHAR_INFO> pGenerateChar =
            GenerateCharInfo(TEXT_SPACE_CHAR);
        if (pGenerateChar) {
          if (!formMatrix.IsIdentity())
            pGenerateChar->m_Matrix = formMatrix;
          m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
          m_TempCharList.push_back(*pGenerateChar);
        }
        break;
      }
      case GenerateCharacter::LineBreak:
        // Close the temporary line, then append CR/LF unless the main text
        // buffer is still empty.
        CloseTempLine();
        if (m_TextBuf.GetSize()) {
          AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
          AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
        }
        break;
      case GenerateCharacter::Hyphen:
        // A one-character object that is itself a hyphen code is dropped.
        // Note that this returns without updating m_pPreTextObj.
        if (pTextObj->CountChars() == 1) {
          CPDF_TextObjectItem item;
          pTextObj->GetCharInfo(0, &item);
          WideString wstrItem =
              pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
          if (wstrItem.IsEmpty())
            wstrItem += (wchar_t)item.m_CharCode;
          wchar_t curChar = wstrItem[0];
          if (IsHyphenCode(curChar))
            return;
        }
        // Strip trailing spaces (0x20) from the temporary line, keeping the
        // text buffer and the char list in step.
        while (m_TempTextBuf.GetSize() > 0 &&
               m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] ==
                   0x20) {
          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
          m_TempCharList.pop_back();
        }
        // Re-tag the last remaining character as a hyphen marker: its
        // buffer character is replaced by 0xfffe and its char info gets
        // unicode 0x2 with the HYPHEN flag.
        // NOTE(review): back() and Delete(GetLength() - 1, 1) are used
        // without checking that the temporary line is non-empty; presumably
        // ProcessInsertObject() only reports Hyphen when it is. Confirm.
        PAGECHAR_INFO* charinfo = &m_TempCharList.back();
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        charinfo->m_Unicode = 0x2;
        charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
        m_TempTextBuf.AppendChar(0xfffe);
        break;
    }
  } else {
    // First object seen: it defines the current-line rectangle.
    m_CurlineRect = Obj.m_pTextObj->GetRect();
  }
  // PreMarkedContent() reported Delay: the object is handled by
  // ProcessMarkedContent() instead of the item loop below.
  if (ePreMKC == FPDFText_MarkedContent::Delay) {
    ProcessMarkedContent(Obj);
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  m_pPreTextObj = pTextObj;
  m_perMatrix = formMatrix;
  float baseSpace = CalculateBaseSpace(pTextObj, matrix);
  const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
  // Right-to-left text drawn with a matrix of negative determinant gets its
  // newly appended characters swapped at the end of this function.
  const bool bIsBidiAndMirrorInverse =
      bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
  // Remember where this object's output starts in both temporary containers.
  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
  int32_t iCharListStartAppend =
      pdfium::CollectionSize<int32_t>(m_TempCharList);
  // Spacing accumulated since the last emitted character.
  float spacing = 0;
  const size_t nItems = pTextObj->CountItems();
  for (size_t i = 0; i < nItems; ++i) {
    CPDF_TextObjectItem item;
    PAGECHAR_INFO charinfo;
    pTextObj->GetItemInfo(i, &item);
    // Items with char code (uint32_t)-1 carry no glyph. Their origin x is
    // turned into a spacing value, unless the text collected so far is
    // empty or already ends with a space.
    if (item.m_CharCode == static_cast<uint32_t>(-1)) {
      WideString str = m_TempTextBuf.MakeString();
      if (str.IsEmpty())
        str = m_TextBuf.AsStringView();
      if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR)
        continue;
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      spacing = -fontsize_h * item.m_Origin.x / 1000;
      continue;
    }
    // Fold the text state's character spacing (when its magnitude exceeds
    // 0.001) into the accumulated spacing, then subtract the base spacing.
    float charSpace = pTextObj->m_TextState.GetCharSpace();
    if (charSpace > 0.001)
      spacing += matrix.TransformDistance(charSpace);
    else if (charSpace < -0.001)
      spacing -= matrix.TransformDistance(fabs(charSpace));
    spacing -= baseSpace;
    if (spacing && i > 0) {
      // Threshold for emitting a generated space: half the width of the
      // font's space glyph, unless that glyph is unavailable or its width
      // exceeds a third of the font size. In those cases the threshold is
      // derived from the current glyph's width via NormalizeThreshold().
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
      float threshold = 0;
      if (space_charcode != CPDF_Font::kInvalidCharCode)
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
      if (threshold > fontsize_h / 3)
        threshold = 0;
      else
        threshold /= 2;
      if (threshold == 0) {
        threshold = static_cast<float>(GetCharWidth(item.m_CharCode, pFont));
        threshold = NormalizeThreshold(threshold, 300, 500, 700);
        threshold = fontsize_h * threshold / 1000;
      }
      if (threshold && (spacing && spacing >= threshold)) {
        // Emit a generated space whose char box is the zero-size rectangle
        // at the transformed item origin.
        charinfo.m_Unicode = TEXT_SPACE_CHAR;
        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
        charinfo.m_pTextObj = pTextObj;
        charinfo.m_Index = m_TextBuf.GetLength();
        m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
        charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
        charinfo.m_Matrix = formMatrix;
        charinfo.m_Origin = matrix.Transform(item.m_Origin);
        charinfo.m_CharBox =
            CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
                          charinfo.m_Origin.x, charinfo.m_Origin.y);
        m_TempCharList.push_back(charinfo);
      }
      // NOTE(review): if CPDF_Font::kInvalidCharCode equals (uint32_t)-1,
      // such items were already skipped at the top of the loop and this
      // check can never fire. Confirm against the constant's definition.
      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
        continue;
    }
    spacing = 0;
    // Map the char code to Unicode. With no mapping and a non-zero code,
    // the raw code is used as the character and flagged UNUNICODE.
    WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
    bool bNoUnicode = false;
    if (wstrItem.IsEmpty() && item.m_CharCode) {
      wstrItem += static_cast<wchar_t>(item.m_CharCode);
      bNoUnicode = true;
    }
    charinfo.m_Index = -1;
    charinfo.m_CharCode = item.m_CharCode;
    charinfo.m_Flag =
        bNoUnicode ? FPDFTEXT_CHAR_UNUNICODE : FPDFTEXT_CHAR_NORMAL;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_Origin = matrix.Transform(item.m_Origin);
    // Char box: the font's bounding box for the glyph, scaled by
    // font size / 1000 and offset by the item origin.
    const FX_RECT rect =
        charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
    const float fFontSize = pTextObj->GetFontSize() / 1000;
    charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
    charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
    // Degenerate boxes get a fallback height (the font size) or a fallback
    // width (the glyph's character width).
    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) <
        kSizeEpsilon) {
      charinfo.m_CharBox.top =
          charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
    }
    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) <
        kSizeEpsilon) {
      charinfo.m_CharBox.right =
          charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
    }
    charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
    charinfo.m_Matrix = matrix;
    // Still no text (char code 0 without a mapping): record the character
    // with unicode 0 and a 0xfffe placeholder in the buffer.
    if (wstrItem.IsEmpty()) {
      charinfo.m_Unicode = 0;
      m_TempCharList.push_back(charinfo);
      m_TempTextBuf.AppendChar(0xfffe);
      continue;
    }
    int nTotal = wstrItem.GetLength();
    // Duplicate detection: look back over at most the 7 most recent
    // entries of the temporary char list. A character with the same char
    // code and font whose origin lies within |threshold| on both axes
    // marks the current one as a duplicate.
    bool bDel = false;
    const int count =
        std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
    float threshold = charinfo.m_Matrix.TransformXDistance(
        static_cast<float>(TEXT_CHARRATIO_GAPDELTA) * pTextObj->GetFontSize());
    for (int n = pdfium::CollectionSize<int>(m_TempCharList);
         n > pdfium::CollectionSize<int>(m_TempCharList) - count; --n) {
      const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
      CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
      if (charinfo1.m_CharCode == charinfo.m_CharCode &&
          charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
        bDel = true;
        break;
      }
    }
    if (!bDel) {
      // Emit one char-info entry per code unit of the mapped string. Zero
      // code units get a 0xfffe placeholder in the buffer and keep
      // whatever m_Index was set before.
      for (int nIndex = 0; nIndex < nTotal; ++nIndex) {
        charinfo.m_Unicode = wstrItem[nIndex];
        if (charinfo.m_Unicode) {
          charinfo.m_Index = m_TextBuf.GetLength();
          m_TempTextBuf.AppendChar(charinfo.m_Unicode);
        } else {
          m_TempTextBuf.AppendChar(0xfffe);
        }
        m_TempCharList.push_back(charinfo);
      }
    } else if (i == 0) {
      // The object's first item was a duplicate: also drop a space that
      // ends the temporary line.
      WideString str = m_TempTextBuf.MakeString();
      if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) {
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        m_TempCharList.pop_back();
      }
    }
  }
  if (bIsBidiAndMirrorInverse)
    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
}
// Appends the text of |pObj| to m_Buffer, first emitting whatever separator
// FPDFText_ProcessInterObj() asks for between the previous object
// (m_pLastObj) and this one.
//
// When m_pObjArray is set it is kept parallel to m_Buffer with two entries
// per buffer character: the originating text object and the item index, or
// two NULLs for characters generated here (spaces, CR, LF).
//
// Returns TRUE only when a line break was requested (result 2, not turned
// into a hyphen join) while |bFirstLine| is set; in that case nothing is
// appended and m_pLastObj is left untouched. Returns FALSE otherwise.
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj,
                                       FX_BOOL bFirstLine) {
  CPDF_Font* pFont = pObj->GetFont();
  CFX_AffineMatrix matrix;
  pObj->GetTextMatrix(&matrix);
  // Index of the first item of |pObj| to emit.
  int item_index = 0;
  if (m_pLastObj) {
    int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
    if (result == 2) {
      int len = m_Buffer.GetLength();
      if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
        // The buffer ends with '-' and line feeds are in use: drop the
        // hyphen (and its pair in the object array) instead of breaking
        // the line.
        m_Buffer.Delete(len - 1, 1);
        if (m_pObjArray) {
          m_pObjArray->RemoveAt((len - 1) * 2, 2);
        }
      } else {
        if (bFirstLine) {
          return TRUE;
        }
        if (m_bUseLF) {
          // CR + LF: two generated characters, hence four NULL entries.
          m_Buffer.AppendChar(L'\r');
          m_Buffer.AppendChar(L'\n');
          if (m_pObjArray) {
            for (int i = 0; i < 4; i ++) {
              m_pObjArray->Add(NULL);
            }
          }
        } else {
          // Without line feeds a line break is written as a single space.
          m_Buffer.AppendChar(' ');
          if (m_pObjArray) {
            m_pObjArray->Add(NULL);
            m_pObjArray->Add(NULL);
          }
        }
      }
    } else if (result == 1) {
      // Result 1: separate the two objects with a generated space.
      m_Buffer.AppendChar(L' ');
      if (m_pObjArray) {
        m_pObjArray->Add(NULL);
        m_pObjArray->Add(NULL);
      }
    } else if (result == -1) {
      // Result -1: the object contributes no text, but still becomes the
      // previous object.
      m_pLastObj = pObj;
      return FALSE;
    } else if (result == 3) {
      // Result 3: skip the object's first item.
      item_index = 1;
    }
  }
  m_pLastObj = pObj;
  int nItems = pObj->CountItems();
  // Look at the odd-indexed items 1, 3, ... below nItems - 1. If every one
  // of them is a (FX_DWORD)-1 item, Ignorekerning ends up as the smallest
  // m_OriginX among them. As soon as one is a real character it is reset to
  // 0 and the scan stops.
  FX_FLOAT Ignorekerning = 0;
  for(int i = 1; i < nItems - 1; i += 2) {
    CPDF_TextObjectItem item;
    pObj->GetItemInfo(i, &item);
    if (item.m_CharCode == (FX_DWORD) - 1) {
      if(i == 1) {
        Ignorekerning = item.m_OriginX;
      } else if(Ignorekerning > item.m_OriginX) {
        Ignorekerning = item.m_OriginX;
      }
    } else {
      Ignorekerning = 0;
      break;
    }
  }
  // Spacing accumulated since the last emitted character.
  FX_FLOAT spacing = 0;
  for (; item_index < nItems; item_index ++) {
    CPDF_TextObjectItem item;
    pObj->GetItemInfo(item_index, &item);
    // (FX_DWORD)-1 items carry no glyph. Their m_OriginX, reduced by
    // Ignorekerning, becomes the pending spacing, unless the buffer is
    // empty or already ends with a space.
    if (item.m_CharCode == (FX_DWORD) - 1) {
      CFX_WideString wstr = m_Buffer.GetWideString();
      if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
        continue;
      }
      FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
      spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
      continue;
    }
    FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
    // The text state's char spacing is ignored for objects with more than
    // three items when no spacing is pending.
    if(nItems > 3 && !spacing) {
      charSpace = 0;
    }
    if((spacing || charSpace) && item_index > 0) {
      // NOTE(review): last_width is never changed from 0, so the comparison
      // with this_width below always picks this_width (which is
      // non-negative after FXSYS_abs, presumably an absolute value).
      int last_width = 0;
      FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
      FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
      // Threshold for emitting a space: half the width of the font's space
      // glyph, unless the lookup failed or that width exceeds a third of
      // the font size. In those cases it is derived from the current
      // glyph's width below.
      FX_FLOAT threshold = 0;
      // NOTE(review): compares an unsigned FX_DWORD against the int -1.
      if (space_charcode != -1) {
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
      }
      if(threshold > fontsize_h / 3) {
        threshold = 0;
      } else {
        threshold /= 2;
      }
      if (threshold == 0) {
        // NOTE(review): this store is dead; threshold is overwritten two
        // statements below before being read.
        threshold = fontsize_h;
        int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
        threshold = this_width > last_width ? (FX_FLOAT)this_width :
                    (FX_FLOAT)last_width;
        // Divide the glyph width by 2, 4, 5 or 6 depending on whether it is
        // below 300, below 500, below 700, or larger, then scale by the
        // font size.
        int nDivide = 6;
        if (threshold < 300) {
          nDivide = 2;
        } else if (threshold < 500) {
          nDivide = 4;
        } else if (threshold < 700) {
          nDivide = 5;
        }
        threshold = threshold / nDivide;
        threshold = fontsize_h * threshold / 1000;
      }
      // Fold char spacing of magnitude above 0.001 into the pending spacing.
      if(charSpace > 0.001) {
        spacing += matrix.TransformDistance(charSpace);
      } else if(charSpace < -0.001) {
        spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
      }
      // A gap at least as large as the threshold becomes a generated space.
      if (threshold && (spacing && spacing >= threshold) ) {
        m_Buffer.AppendChar(L' ');
        if (m_pObjArray) {
          m_pObjArray->Add(NULL);
          m_pObjArray->Add(NULL);
        }
      }
      // NOTE(review): unreachable. Items with char code (FX_DWORD)-1
      // already hit a `continue` at the top of the loop body.
      if (item.m_CharCode == (FX_DWORD) - 1) {
        continue;
      }
      spacing = 0;
    }
    // Emit the character. Without a Unicode mapping the raw char code is
    // written as one character; otherwise the whole mapped string is
    // written and each of its characters is attributed to this item.
    CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
    if (unicode_str.IsEmpty()) {
      m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
      if (m_pObjArray) {
        m_pObjArray->Add((void*)pObj);
        m_pObjArray->Add((void*)(FX_INTPTR)item_index);
      }
    } else {
      m_Buffer << unicode_str;
      if (m_pObjArray) {
        for (int i = 0; i < unicode_str.GetLength(); i ++) {
          m_pObjArray->Add((void*)pObj);
          m_pObjArray->Add((void*)(FX_INTPTR)item_index);
        }
      }
    }
  }
  return FALSE;
}