Example #1
0
FPDFText_MarkedContent CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  size_t nContentMarks = pTextObj->m_ContentMarks.CountItems();
  if (nContentMarks == 0)
    return FPDFText_MarkedContent::Pass;

  WideString actText;
  bool bExist = false;
  const CPDF_Dictionary* pDict = nullptr;
  for (size_t i = 0; i < nContentMarks; ++i) {
    const CPDF_ContentMarkItem* item = pTextObj->m_ContentMarks.GetItem(i);
    pDict = item->GetParam();
    if (!pDict)
      continue;
    const CPDF_String* temp = ToString(pDict->GetObjectFor("ActualText"));
    if (temp) {
      bExist = true;
      actText = temp->GetUnicodeText();
    }
  }
  if (!bExist)
    return FPDFText_MarkedContent::Pass;

  if (m_pPreTextObj) {
    const CPDF_ContentMarks& marks = m_pPreTextObj->m_ContentMarks;
    if (marks.CountItems() == nContentMarks &&
        marks.GetItem(nContentMarks - 1)->GetParam() == pDict) {
      return FPDFText_MarkedContent::Done;
    }
  }

  if (actText.IsEmpty())
    return FPDFText_MarkedContent::Pass;

  CPDF_Font* pFont = pTextObj->GetFont();
  bExist = false;
  for (size_t i = 0; i < actText.GetLength(); ++i) {
    if (pFont->CharCodeFromUnicode(actText[i]) != CPDF_Font::kInvalidCharCode) {
      bExist = true;
      break;
    }
  }
  if (!bExist)
    return FPDFText_MarkedContent::Pass;

  bExist = false;
  for (size_t i = 0; i < actText.GetLength(); ++i) {
    wchar_t wChar = actText[i];
    if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
      bExist = true;
      break;
    }
  }
  if (!bExist)
    return FPDFText_MarkedContent::Done;

  return FPDFText_MarkedContent::Delay;
}
Example #2
0
void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();

  size_t nContentMarks = pTextObj->m_ContentMarks.CountItems();
  if (nContentMarks == 0)
    return;

  WideString actText;
  for (size_t n = 0; n < nContentMarks; ++n) {
    const CPDF_ContentMarkItem* item = pTextObj->m_ContentMarks.GetItem(n);
    const CPDF_Dictionary* pDict = item->GetParam();
    if (pDict)
      actText = pDict->GetUnicodeTextFor("ActualText");
  }
  if (actText.IsEmpty())
    return;

  CPDF_Font* pFont = pTextObj->GetFont();
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * Obj.m_formMatrix;

  for (size_t k = 0; k < actText.GetLength(); ++k) {
    wchar_t wChar = actText[k];
    if (wChar <= 0x80 && !isprint(wChar))
      wChar = 0x20;
    if (wChar >= 0xFFFD)
      continue;

    PAGECHAR_INFO charinfo;
    charinfo.m_Origin = pTextObj->GetPos();
    charinfo.m_Index = m_TextBuf.GetLength();
    charinfo.m_Unicode = wChar;
    charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
    charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_CharBox = pTextObj->GetRect();
    charinfo.m_Matrix = matrix;
    m_TempTextBuf.AppendChar(wChar);
    m_TempCharList.push_back(charinfo);
  }
}
Example #3
0
void CTextPage::ProcessObject(CPDF_PageObject* pObject)
{
    if (pObject->m_Type != PDFPAGE_TEXT) {
        return;
    }
    CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
    CPDF_Font* pFont = pText->m_TextState.GetFont();
    int count = pText->CountItems();
    FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
    if (pPosArray) {
        pText->CalcCharPos(pPosArray);
    }
    FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
    FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
    FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
    FX_FLOAT spacew = 0;
    if (space_charcode != -1) {
        spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
    }
    if (spacew == 0) {
        spacew = fontsize_h / 4;
    }
    if (pText->m_TextState.GetBaselineAngle() != 0) {
        int cc = 0;
        CFX_AffineMatrix matrix;
        pText->GetTextMatrix(&matrix);
        for (int i = 0; i < pText->m_nChars; i ++) {
            FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
            if (charcode == (FX_DWORD) - 1) {
                continue;
            }
            FX_RECT char_box;
            pFont->GetCharBBox(charcode, char_box);
            FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000;
            FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000;
            FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000;
            FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
            cc ++;
            FX_FLOAT char_origx, char_origy;
            matrix.Transform(char_left, 0, char_origx, char_origy);
            matrix.TransformRect(char_left, char_right, char_top, char_bottom);
            CFX_ByteString str;
            pFont->AppendChar(str, charcode);
            InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
                          char_bottom, spacew, fontsize_v, str, pFont);
        }
        if (pPosArray) {
            FX_Free(pPosArray);
        }
        return;
    }
    FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
    for (int ii = 0; ii < count * 2; ii ++) {
        pPosArray[ii] *= ratio_h;
    }
    FX_FLOAT baseline = pText->m_PosY;
    CTextBaseLine* pBaseLine = NULL;
    FX_FLOAT topy = pText->m_Top;
    FX_FLOAT bottomy = pText->m_Bottom;
    FX_FLOAT leftx = pText->m_Left;
    int cc = 0;
    CFX_ByteString segment;
    int space_count = 0;
    FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
    for (int i = 0; i < pText->m_nChars; i ++) {
        FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i];
        if (charcode == (FX_DWORD) - 1) {
            continue;
        }
        FX_FLOAT char_left = pPosArray[cc * 2];
        FX_FLOAT char_right = pPosArray[cc * 2 + 1];
        cc ++;
        if (char_left < last_left || (char_left - last_right) > spacew / 2) {
            pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
                                      topy, bottomy, spacew, fontsize_v, segment, pFont);
            segment_left = char_left;
            segment = "";
        }
        if (space_count > 1) {
            pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
                                      topy, bottomy, spacew, fontsize_v, segment, pFont);
            segment = "";
        } else if (space_count == 1) {
            pFont->AppendChar(segment, ' ');
        }
        if (segment.GetLength() == 0) {
            segment_left = char_left;
        }
        segment_right = char_right;
        pFont->AppendChar(segment, charcode);
        space_count = 0;
        last_left = char_left;
        last_right = char_right;
    }
    if (segment.GetLength())
        pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right,
                                  topy, bottomy, spacew, fontsize_v, segment, pFont);
    FX_Free(pPosArray);
}
Example #4
0
void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
  CPDF_TextObject* pTextObj = Obj.m_pTextObj.Get();
  if (fabs(pTextObj->GetRect().Width()) < kSizeEpsilon)
    return;
  CFX_Matrix formMatrix = Obj.m_formMatrix;
  CPDF_Font* pFont = pTextObj->GetFont();
  CFX_Matrix matrix = pTextObj->GetTextMatrix() * formMatrix;

  FPDFText_MarkedContent ePreMKC = PreMarkedContent(Obj);
  if (ePreMKC == FPDFText_MarkedContent::Done) {
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  GenerateCharacter result = GenerateCharacter::None;
  if (m_pPreTextObj) {
    result = ProcessInsertObject(pTextObj, formMatrix);
    if (result == GenerateCharacter::LineBreak)
      m_CurlineRect = Obj.m_pTextObj->GetRect();
    else
      m_CurlineRect.Union(Obj.m_pTextObj->GetRect());

    switch (result) {
      case GenerateCharacter::None:
        break;
      case GenerateCharacter::Space: {
        Optional<PAGECHAR_INFO> pGenerateChar =
            GenerateCharInfo(TEXT_SPACE_CHAR);
        if (pGenerateChar) {
          if (!formMatrix.IsIdentity())
            pGenerateChar->m_Matrix = formMatrix;
          m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
          m_TempCharList.push_back(*pGenerateChar);
        }
        break;
      }
      case GenerateCharacter::LineBreak:
        CloseTempLine();
        if (m_TextBuf.GetSize()) {
          AppendGeneratedCharacter(TEXT_RETURN_CHAR, formMatrix);
          AppendGeneratedCharacter(TEXT_LINEFEED_CHAR, formMatrix);
        }
        break;
      case GenerateCharacter::Hyphen:
        if (pTextObj->CountChars() == 1) {
          CPDF_TextObjectItem item;
          pTextObj->GetCharInfo(0, &item);
          WideString wstrItem =
              pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
          if (wstrItem.IsEmpty())
            wstrItem += (wchar_t)item.m_CharCode;
          wchar_t curChar = wstrItem[0];
          if (IsHyphenCode(curChar))
            return;
        }
        while (m_TempTextBuf.GetSize() > 0 &&
               m_TempTextBuf.AsStringView()[m_TempTextBuf.GetLength() - 1] ==
                   0x20) {
          m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
          m_TempCharList.pop_back();
        }
        PAGECHAR_INFO* charinfo = &m_TempCharList.back();
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        charinfo->m_Unicode = 0x2;
        charinfo->m_Flag = FPDFTEXT_CHAR_HYPHEN;
        m_TempTextBuf.AppendChar(0xfffe);
        break;
    }
  } else {
    m_CurlineRect = Obj.m_pTextObj->GetRect();
  }

  if (ePreMKC == FPDFText_MarkedContent::Delay) {
    ProcessMarkedContent(Obj);
    m_pPreTextObj = pTextObj;
    m_perMatrix = formMatrix;
    return;
  }
  m_pPreTextObj = pTextObj;
  m_perMatrix = formMatrix;
  float baseSpace = CalculateBaseSpace(pTextObj, matrix);

  const bool bR2L = IsRightToLeft(*pTextObj, *pFont);
  const bool bIsBidiAndMirrorInverse =
      bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
  int32_t iBufStartAppend = m_TempTextBuf.GetLength();
  int32_t iCharListStartAppend =
      pdfium::CollectionSize<int32_t>(m_TempCharList);

  float spacing = 0;
  const size_t nItems = pTextObj->CountItems();
  for (size_t i = 0; i < nItems; ++i) {
    CPDF_TextObjectItem item;
    PAGECHAR_INFO charinfo;
    pTextObj->GetItemInfo(i, &item);
    if (item.m_CharCode == static_cast<uint32_t>(-1)) {
      WideString str = m_TempTextBuf.MakeString();
      if (str.IsEmpty())
        str = m_TextBuf.AsStringView();
      if (str.IsEmpty() || str[str.GetLength() - 1] == TEXT_SPACE_CHAR)
        continue;

      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      spacing = -fontsize_h * item.m_Origin.x / 1000;
      continue;
    }
    float charSpace = pTextObj->m_TextState.GetCharSpace();
    if (charSpace > 0.001)
      spacing += matrix.TransformDistance(charSpace);
    else if (charSpace < -0.001)
      spacing -= matrix.TransformDistance(fabs(charSpace));
    spacing -= baseSpace;
    if (spacing && i > 0) {
      float fontsize_h = pTextObj->m_TextState.GetFontSizeH();
      uint32_t space_charcode = pFont->CharCodeFromUnicode(' ');
      float threshold = 0;
      if (space_charcode != CPDF_Font::kInvalidCharCode)
        threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
      if (threshold > fontsize_h / 3)
        threshold = 0;
      else
        threshold /= 2;
      if (threshold == 0) {
        threshold = static_cast<float>(GetCharWidth(item.m_CharCode, pFont));
        threshold = NormalizeThreshold(threshold, 300, 500, 700);
        threshold = fontsize_h * threshold / 1000;
      }
      if (threshold && (spacing && spacing >= threshold)) {
        charinfo.m_Unicode = TEXT_SPACE_CHAR;
        charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
        charinfo.m_pTextObj = pTextObj;
        charinfo.m_Index = m_TextBuf.GetLength();
        m_TempTextBuf.AppendChar(TEXT_SPACE_CHAR);
        charinfo.m_CharCode = CPDF_Font::kInvalidCharCode;
        charinfo.m_Matrix = formMatrix;
        charinfo.m_Origin = matrix.Transform(item.m_Origin);
        charinfo.m_CharBox =
            CFX_FloatRect(charinfo.m_Origin.x, charinfo.m_Origin.y,
                          charinfo.m_Origin.x, charinfo.m_Origin.y);
        m_TempCharList.push_back(charinfo);
      }
      if (item.m_CharCode == CPDF_Font::kInvalidCharCode)
        continue;
    }
    spacing = 0;
    WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
    bool bNoUnicode = false;
    if (wstrItem.IsEmpty() && item.m_CharCode) {
      wstrItem += static_cast<wchar_t>(item.m_CharCode);
      bNoUnicode = true;
    }
    charinfo.m_Index = -1;
    charinfo.m_CharCode = item.m_CharCode;
    charinfo.m_Flag =
        bNoUnicode ? FPDFTEXT_CHAR_UNUNICODE : FPDFTEXT_CHAR_NORMAL;
    charinfo.m_pTextObj = pTextObj;
    charinfo.m_Origin = matrix.Transform(item.m_Origin);

    const FX_RECT rect =
        charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode);
    const float fFontSize = pTextObj->GetFontSize() / 1000;
    charinfo.m_CharBox.top = rect.top * fFontSize + item.m_Origin.y;
    charinfo.m_CharBox.left = rect.left * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.right = rect.right * fFontSize + item.m_Origin.x;
    charinfo.m_CharBox.bottom = rect.bottom * fFontSize + item.m_Origin.y;
    if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) <
        kSizeEpsilon) {
      charinfo.m_CharBox.top =
          charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
    }
    if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) <
        kSizeEpsilon) {
      charinfo.m_CharBox.right =
          charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
    }
    charinfo.m_CharBox = matrix.TransformRect(charinfo.m_CharBox);
    charinfo.m_Matrix = matrix;
    if (wstrItem.IsEmpty()) {
      charinfo.m_Unicode = 0;
      m_TempCharList.push_back(charinfo);
      m_TempTextBuf.AppendChar(0xfffe);
      continue;
    }
    int nTotal = wstrItem.GetLength();
    bool bDel = false;
    const int count = std::min(pdfium::CollectionSize<int>(m_TempCharList), 7);
    float threshold = charinfo.m_Matrix.TransformXDistance(
        static_cast<float>(TEXT_CHARRATIO_GAPDELTA) * pTextObj->GetFontSize());
    for (int n = pdfium::CollectionSize<int>(m_TempCharList);
         n > pdfium::CollectionSize<int>(m_TempCharList) - count; --n) {
      const PAGECHAR_INFO& charinfo1 = m_TempCharList[n - 1];
      CFX_PointF diff = charinfo1.m_Origin - charinfo.m_Origin;
      if (charinfo1.m_CharCode == charinfo.m_CharCode &&
          charinfo1.m_pTextObj->GetFont() == charinfo.m_pTextObj->GetFont() &&
          fabs(diff.x) < threshold && fabs(diff.y) < threshold) {
        bDel = true;
        break;
      }
    }
    if (!bDel) {
      for (int nIndex = 0; nIndex < nTotal; ++nIndex) {
        charinfo.m_Unicode = wstrItem[nIndex];
        if (charinfo.m_Unicode) {
          charinfo.m_Index = m_TextBuf.GetLength();
          m_TempTextBuf.AppendChar(charinfo.m_Unicode);
        } else {
          m_TempTextBuf.AppendChar(0xfffe);
        }
        m_TempCharList.push_back(charinfo);
      }
    } else if (i == 0) {
      WideString str = m_TempTextBuf.MakeString();
      if (!str.IsEmpty() && str[str.GetLength() - 1] == TEXT_SPACE_CHAR) {
        m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
        m_TempCharList.pop_back();
      }
    }
  }
  if (bIsBidiAndMirrorInverse)
    SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
}
Example #5
0
FX_BOOL CPDF_TextStream::ProcessObject(const CPDF_TextObject* pObj, FX_BOOL bFirstLine)
{
    CPDF_Font* pFont = pObj->GetFont();
    CFX_AffineMatrix matrix;
    pObj->GetTextMatrix(&matrix);
    int item_index = 0;
    if (m_pLastObj) {
        int result = FPDFText_ProcessInterObj(m_pLastObj, pObj);
        if (result == 2) {
            int len = m_Buffer.GetLength();
            if (len && m_bUseLF && m_Buffer.GetBuffer()[len - 1] == L'-') {
                m_Buffer.Delete(len - 1, 1);
                if (m_pObjArray) {
                    m_pObjArray->RemoveAt((len - 1) * 2, 2);
                }
            } else {
                if (bFirstLine) {
                    return TRUE;
                }
                if (m_bUseLF) {
                    m_Buffer.AppendChar(L'\r');
                    m_Buffer.AppendChar(L'\n');
                    if (m_pObjArray) {
                        for (int i = 0; i < 4; i ++) {
                            m_pObjArray->Add(NULL);
                        }
                    }
                } else {
                    m_Buffer.AppendChar(' ');
                    if (m_pObjArray) {
                        m_pObjArray->Add(NULL);
                        m_pObjArray->Add(NULL);
                    }
                }
            }
        } else if (result == 1) {
            m_Buffer.AppendChar(L' ');
            if (m_pObjArray) {
                m_pObjArray->Add(NULL);
                m_pObjArray->Add(NULL);
            }
        } else if (result == -1) {
            m_pLastObj = pObj;
            return FALSE;
        } else if (result == 3) {
            item_index = 1;
        }
    }
    m_pLastObj = pObj;
    int nItems = pObj->CountItems();
    FX_FLOAT Ignorekerning = 0;
    for(int i = 1; i < nItems - 1; i += 2) {
        CPDF_TextObjectItem item;
        pObj->GetItemInfo(i, &item);
        if (item.m_CharCode == (FX_DWORD) - 1) {
            if(i == 1) {
                Ignorekerning = item.m_OriginX;
            } else if(Ignorekerning > item.m_OriginX) {
                Ignorekerning = item.m_OriginX;
            }
        } else {
            Ignorekerning = 0;
            break;
        }
    }
    FX_FLOAT spacing = 0;
    for (; item_index < nItems; item_index ++) {
        CPDF_TextObjectItem item;
        pObj->GetItemInfo(item_index, &item);
        if (item.m_CharCode == (FX_DWORD) - 1) {
            CFX_WideString wstr = m_Buffer.GetWideString();
            if (wstr.IsEmpty() || wstr.GetAt(wstr.GetLength() - 1) == L' ') {
                continue;
            }
            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
            spacing = -fontsize_h * (item.m_OriginX - Ignorekerning) / 1000;
            continue;
        }
        FX_FLOAT charSpace = pObj->m_TextState.GetObject()->m_CharSpace;
        if(nItems > 3 && !spacing) {
            charSpace = 0;
        }
        if((spacing || charSpace) && item_index > 0) {
            int last_width = 0;
            FX_FLOAT fontsize_h = pObj->m_TextState.GetFontSizeH();
            FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
            FX_FLOAT threshold = 0;
            if (space_charcode != -1) {
                threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ;
            }
            if(threshold > fontsize_h / 3) {
                threshold = 0;
            } else {
                threshold /= 2;
            }
            if (threshold == 0) {
                threshold = fontsize_h;
                int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
                threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX_FLOAT)last_width;
                int nDivide = 6;
                if (threshold < 300) {
                    nDivide = 2;
                } else if (threshold < 500) {
                    nDivide = 4;
                } else if (threshold < 700) {
                    nDivide = 5;
                }
                threshold = threshold / nDivide;
                threshold = fontsize_h * threshold / 1000;
            }
            if(charSpace > 0.001) {
                spacing += matrix.TransformDistance(charSpace);
            } else if(charSpace < -0.001) {
                spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
            }
            if (threshold && (spacing && spacing >= threshold) ) {
                m_Buffer.AppendChar(L' ');
                if (m_pObjArray) {
                    m_pObjArray->Add(NULL);
                    m_pObjArray->Add(NULL);
                }
            }
            if (item.m_CharCode == (FX_DWORD) - 1) {
                continue;
            }
            spacing = 0;
        }
        CFX_WideString unicode_str = pFont->UnicodeFromCharCode(item.m_CharCode);
        if (unicode_str.IsEmpty()) {
            m_Buffer.AppendChar((FX_WCHAR)item.m_CharCode);
            if (m_pObjArray) {
                m_pObjArray->Add((void*)pObj);
                m_pObjArray->Add((void*)(FX_INTPTR)item_index);
            }
        } else {
            m_Buffer << unicode_str;
            if (m_pObjArray) {
                for (int i = 0; i < unicode_str.GetLength(); i ++) {
                    m_pObjArray->Add((void*)pObj);
                    m_pObjArray->Add((void*)(FX_INTPTR)item_index);
                }
            }
        }
    }
    return FALSE;
}