示例#1
0
int	Document::CountWords(CPDF_TextObject* pTextObj)
{
	if (!pTextObj) return 0;

	int nWords = 0;

	CPDF_Font* pFont = pTextObj->GetFont();
	if (!pFont) return 0;

	FX_BOOL bIsLatin = FALSE;

	for (int i=0, sz=pTextObj->CountChars(); i<sz; i++)
	{
		FX_DWORD charcode = -1;
		FX_FLOAT kerning;

		pTextObj->GetCharInfo(i, charcode, kerning);
		CFX_WideString swUnicode = pFont->UnicodeFromCharCode(charcode);

		FX_WORD unicode = 0;
		if (swUnicode.GetLength() > 0)
			unicode = swUnicode[0];

		if (ISLATINWORD(unicode) && bIsLatin)
			continue;

		bIsLatin = ISLATINWORD(unicode);
		if (unicode != 0x20)
			nWords++;
	}

	return nWords;
}
示例#2
0
CFX_WideString Document::GetObjWordStr(CPDF_TextObject* pTextObj, int nWordIndex)
{
	ASSERT(pTextObj != NULL);

	CFX_WideString swRet;

	CPDF_Font* pFont = pTextObj->GetFont();
	if (!pFont) return L"";

	int nWords = 0;
	FX_BOOL bIsLatin = FALSE;

	for (int i=0, sz=pTextObj->CountChars(); i<sz; i++)
	{
		FX_DWORD charcode = -1;
		FX_FLOAT kerning;

		pTextObj->GetCharInfo(i, charcode, kerning);
		CFX_WideString swUnicode = pFont->UnicodeFromCharCode(charcode);

		FX_WORD unicode = 0;
		if (swUnicode.GetLength() > 0)
			unicode = swUnicode[0];

		if (ISLATINWORD(unicode) && bIsLatin)
		{
		}
		else
		{
			bIsLatin = ISLATINWORD(unicode);
			if (unicode != 0x20)
				nWords++;
		}

		if (nWords-1 == nWordIndex)
			swRet += unicode;
	}

	return swRet;
}
示例#3
0
void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
    return;
  CFX_Matrix formMatrix = Obj.m_formMatrix;
  CPDF_Font* pFont = pTextObj->GetFont();
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * formMatrix;

  FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
  if (ePreMKC == FPDFText_MarkedContent::Done) {
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  GenerateCharacter result = GenerateCharacter::None;
  if (m_pPreTextObj) {
    result = ProcessInsertObject(pTextObj, formMatrix);
    if (result == GenerateCharacter::LineBreak)
      m_CurlineRect = Obj.m_pTextObj->GetRect();
    else
      m_CurlineRect.Union(Obj.m_pTextObj->GetRect());

    switch (result) {
      case GenerateCharacter::None:
        break;
      case GenerateCharacter::Space: {
        Optional<PAGECHAR_INFO> pGenerateChar =
            GenerateCharInfo(TEXT_SPACE_CHAR);
        if (pGenerateChar) {
          if (!formMatrix.IsIdentity())
            pGenerateChar->m_Matrix = formMatrix;
          m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
          m_TempCharList.push_back(*pGenerateChar);
        }
        break;
      }
      case GenerateCharacter::LineBreak:
        CloseTempLine();
        if (m_TextBuf.GetSize()) {
          AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
          AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
        }
        break;
      case GenerateCharacter::Hyphen:
        if (pTextObj->CountChars() == 1) {
          CPDF_TextObjectItem item;
          pTextObj->GetCharInfo(0, &item);
          WideString wstrItem =
              pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
          if (wstrItem.IsEmpty())
            wstrItem += (wchar_t)item.m_CharCode;
          wchar_t curChar = wstrItem[0];
          if (IsHyphenCode(curChar))
            return;
        }
        while (m_TempTextBuf.GetSize() > 0 &&
               m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] ==
                   0x20) {
          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
          m_TempCharList.pop_back();
        }
        PAGECHAR_INFO* charinfo = &m_TempCharList.back();
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        charinfo->m_Unicode = 0x2;
        charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
        m_TempTextBuf.AppendChar(0xfffe);
        break;
    }
  } else {
    m_CurlineRect = Obj.m_pTextObj->GetRect();
  }

  if (ePreMKC == FPDFText_MarkedContent::Delay) {
    ProcessMarkedContent(Obj);
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  m_pPreTextObj = pTextObj;
  m_perMatrix = formMatrix;
  float baseSpace = CalculateBaseSpace(pTextObj, matrix);

  const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
  const bool bIsBidiAndMirrorInverse =
      bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
  int32_t iCharListStartAppend =
      pdfium::CollectionSize<int32_t>(m_TempCharList);

  float spacing = 0;
  const size_t nItems = pTextObj->CountItems();
  for (size_t i = 0; i < nItems; ++i) {
    CPDF_TextObjectItem item;
    PAGECHAR_INFO charinfo;
    pTextObj->GetItemInfo(i, &item);
    if (item.m_CharCode == static_cast<uint32_t>(-1)) {
      WideString str = m_TempTextBuf.MakeString();
      if (str.IsEmpty())
        str = m_TextBuf.AsStringView();
      if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR)
        continue;

      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      spacing = -fontsize_h * item.m_Origin.x / 1000;
      continue;
    }
    float charSpace = pTextObj->m_TextState.GetCharSpace();
    if (charSpace > 0.001)
      spacing += matrix.TransformDistance(charSpace);
    else if (charSpace < -0.001)
      spacing -= matrix.TransformDistance(fabs(charSpace));
    spacing -= baseSpace;
    if (spacing && i > 0) {
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
      float threshold = 0;
      if (space_charcode != CPDF_Font::kInvalidCharCode)
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
      if (threshold > fontsize_h / 3)
        threshold = 0;
      else
        threshold /= 2;
      if (threshold == 0) {
        threshold = static_cast<float>(GetCharWidth(item.m_CharCode, pFont));
        threshold = NormalizeThreshold(threshold, 300, 500, 700);
        threshold = fontsize_h * threshold / 1000;
      }
      if (threshold && (spacing && spacing >= threshold)) {
        charinfo.m_Unicode = TEXT_SPACE_CHAR;
        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
        charinfo.m_pTextObj = pTextObj;
        charinfo.m_Index = m_TextBuf.GetLength();
        m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
        charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
        charinfo.m_Matrix = formMatrix;
        charinfo.m_Origin = matrix.Transform(item.m_Origin);
        charinfo.m_CharBox =
            CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
                          charinfo.m_Origin.x, charinfo.m_Origin.y);
        m_TempCharList.push_back(charinfo);
      }
      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
        continue;
    }
    spacing = 0;
    WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
    bool bNoUnicode = false;
    if (wstrItem.IsEmpty() && item.m_CharCode) {
      wstrItem += static_cast<wchar_t>(item.m_CharCode);
      bNoUnicode = true;
    }
    charinfo.m_Index = -1;
    charinfo.m_CharCode = item.m_CharCode;
    charinfo.m_Flag =
        bNoUnicode ? FPDFTEXT_CHAR_UNUNICODE : FPDFTEXT_CHAR_NORMAL;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_Origin = matrix.Transform(item.m_Origin);

    const FX_RECT rect =
        charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
    const float fFontSize = pTextObj->GetFontSize() / 1000;
    charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
    charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) <
        kSizeEpsilon) {
      charinfo.m_CharBox.top =
          charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
    }
    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) <
        kSizeEpsilon) {
      charinfo.m_CharBox.right =
          charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
    }
    charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
    charinfo.m_Matrix = matrix;
    if (wstrItem.IsEmpty()) {
      charinfo.m_Unicode = 0;
      m_TempCharList.push_back(charinfo);
      m_TempTextBuf.AppendChar(0xfffe);
      continue;
    }
    int nTotal = wstrItem.GetLength();
    bool bDel = false;
    const int count = std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
    float threshold = charinfo.m_Matrix.TransformXDistance(
        static_cast<float>(TEXT_CHARRATIO_GAPDELTA) * pTextObj->GetFontSize());
    for (int n = pdfium::CollectionSize<int>(m_TempCharList);
         n > pdfium::CollectionSize<int>(m_TempCharList) - count; --n) {
      const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
      CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
      if (charinfo1.m_CharCode == charinfo.m_CharCode &&
          charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
        bDel = true;
        break;
      }
    }
    if (!bDel) {
      for (int nIndex = 0; nIndex < nTotal; ++nIndex) {
        charinfo.m_Unicode = wstrItem[nIndex];
        if (charinfo.m_Unicode) {
          charinfo.m_Index = m_TextBuf.GetLength();
          m_TempTextBuf.AppendChar(charinfo.m_Unicode);
        } else {
          m_TempTextBuf.AppendChar(0xfffe);
        }
        m_TempCharList.push_back(charinfo);
      }
    } else if (i == 0) {
      WideString str = m_TempTextBuf.MakeString();
      if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) {
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        m_TempCharList.pop_back();
      }
    }
  }
  if (bIsBidiAndMirrorInverse)
    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
}
示例#4
0
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine)
{
    CPDF_Font* pFont = pObj->GetFont();
    CFX_AffineMatrix matrix;
    pObj->GetTextMatrix(&matrix);
    int item_index = 0;
    if (m_pLastObj) {
        int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
        if (result == 2) {
            int len = m_Buffer.GetLength();
            if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
                m_Buffer.Delete(len - 1, 1);
                if (m_pObjArray) {
                    m_pObjArray->RemoveAt((len - 1) * 2, 2);
                }
            } else {
                if (bFirstLine) {
                    return TRUE;
                }
                if (m_bUseLF) {
                    m_Buffer.AppendChar(L'\r');
                    m_Buffer.AppendChar(L'\n');
                    if (m_pObjArray) {
                        for (int i = 0; i < 4; i ++) {
                            m_pObjArray->Add(NULL);
                        }
                    }
                } else {
                    m_Buffer.AppendChar(' ');
                    if (m_pObjArray) {
                        m_pObjArray->Add(NULL);
                        m_pObjArray->Add(NULL);
                    }
                }
            }
        } else if (result == 1) {
            m_Buffer.AppendChar(L' ');
            if (m_pObjArray) {
                m_pObjArray->Add(NULL);
                m_pObjArray->Add(NULL);
            }
        } else if (result == -1) {
            m_pLastObj = pObj;
            return FALSE;
        } else if (result == 3) {
            item_index = 1;
        }
    }
    m_pLastObj = pObj;
    int nItems = pObj->CountItems();
    FX_FLOAT Ignorekerning = 0;
    for(int i = 1; i < nItems - 1; i += 2) {
        CPDF_TextObjectItem item;
        pObj->GetItemInfo(i, &item);
        if (item.m_CharCode == (FX_DWORD) - 1) {
            if(i == 1) {
                Ignorekerning = item.m_OriginX;
            } else if(Ignorekerning > item.m_OriginX) {
                Ignorekerning = item.m_OriginX;
            }
        } else {
            Ignorekerning = 0;
            break;
        }
    }
    FX_FLOAT spacing = 0;
    for (; item_index < nItems; item_index ++) {
        CPDF_TextObjectItem item;
        pObj->GetItemInfo(item_index, &item);
        if (item.m_CharCode == (FX_DWORD) - 1) {
            CFX_WideString wstr = m_Buffer.GetWideString();
            if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
                continue;
            }
            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
            spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
            continue;
        }
        FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
        if(nItems > 3 && !spacing) {
            charSpace = 0;
        }
        if((spacing || charSpace) && item_index > 0) {
            int last_width = 0;
            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
            FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
            FX_FLOAT threshold = 0;
            if (space_charcode != -1) {
                threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
            }
            if(threshold > fontsize_h / 3) {
                threshold = 0;
            } else {
                threshold /= 2;
            }
            if (threshold == 0) {
                threshold = fontsize_h;
                int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
                threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
                int nDivide = 6;
                if (threshold < 300) {
                    nDivide = 2;
                } else if (threshold < 500) {
                    nDivide = 4;
                } else if (threshold < 700) {
                    nDivide = 5;
                }
                threshold = threshold / nDivide;
                threshold = fontsize_h * threshold / 1000;
            }
            if(charSpace > 0.001) {
                spacing += matrix.TransformDistance(charSpace);
            } else if(charSpace < -0.001) {
                spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
            }
            if (threshold && (spacing && spacing >= threshold) ) {
                m_Buffer.AppendChar(L' ');
                if (m_pObjArray) {
                    m_pObjArray->Add(NULL);
                    m_pObjArray->Add(NULL);
                }
            }
            if (item.m_CharCode == (FX_DWORD) - 1) {
                continue;
            }
            spacing = 0;
        }
        CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
        if (unicode_str.IsEmpty()) {
            m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
            if (m_pObjArray) {
                m_pObjArray->Add((void*)pObj);
                m_pObjArray->Add((void*)(FX_INTPTR)item_index);
            }
        } else {
            m_Buffer << unicode_str;
            if (m_pObjArray) {
                for (int i = 0; i < unicode_str.GetLength(); i ++) {
                    m_pObjArray->Add((void*)pObj);
                    m_pObjArray->Add((void*)(FX_INTPTR)item_index);
                }
            }
        }
    }
    return FALSE;
}