void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage, int iMinWidth, FX_DWORD flags) { lines.RemoveAll(); if (pPage == NULL) { return; } CPDF_Page page; page.Load(pDoc, pPage); CPDF_ParseOptions options; options.m_bTextOnly = TRUE; options.m_bSeparateForm = FALSE; page.ParseContent(&options); CFX_FloatRect page_bbox = page.GetPageBBox(); if (flags & PDF2TXT_AUTO_ROTATE) { CheckRotate(page, page_bbox); } CTextPage texts; texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; texts.m_bBreakSpace = TRUE; FX_POSITION pos = page.GetFirstObjectPosition(); while (pos) { CPDF_PageObject* pObject = page.GetNextObject(pos); if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top); if (!page_bbox.Contains(rect)) { continue; } } texts.ProcessObject(pObject); } texts.WriteOutput(lines, iMinWidth); }
// Collects the bounding rectangles of a page's content objects.
//
// The page described by |pDict| is loaded and parsed, and the
// left/bottom/right/top rectangle of every page object accepted by
// IsValiableRect() (tested against the page's "MediaBox" entry) is appended
// to |pRectArray|.
//
//   pDoc       - document that owns the page.
//   pDict      - page dictionary; must not be NULL.
//   pRectArray - receives the accepted rectangles; must not be NULL.
//
// Returns TRUE after the page has been processed, or FALSE when |pDict| or
// |pRectArray| is NULL (nothing is appended in that case).
FX_BOOL GetContentsRect( CPDF_Document * pDoc, CPDF_Dictionary* pDict, CPDF_RectArray * pRectArray )
{
    // Both pointers are dereferenced below; reject NULL instead of crashing.
    if (!pDict || !pRectArray) {
        return FALSE;
    }

    // The page lives on the stack, like the other page helpers in this file:
    // no unchecked allocation result and no manual delete to forget.
    CPDF_Page page;
    page.Load(pDoc, pDict, FALSE);
    page.ParseContent();

    // The MediaBox does not change while iterating; look it up once.
    const CPDF_Rect rcMediaBox = pDict->GetRect("MediaBox");

    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pPageObject = page.GetNextObject(pos);
        if (!pPageObject) {
            continue;
        }

        CPDF_Rect rc;
        rc.left = pPageObject->m_Left;
        rc.right = pPageObject->m_Right;
        rc.bottom = pPageObject->m_Bottom;
        rc.top = pPageObject->m_Top;

        if (IsValiableRect(rc, rcMediaBox)) {
            pRectArray->Add(rc);
        }
    }
    return TRUE;
}
// JavaScript method: counts the words on a page.
//
//   cc     - script context; used to look up the error string.
//   params - params[0] is the page number (defaults to 0 when absent).
//   vRet   - receives the sum of CountWords() over all text objects of the
//            page.
//   sError - set to the IDS_STRING_JSVALUEERROR string when the page number
//            is out of range.
//
// Returns TRUE on success. Returns FALSE when the FPDFPERM_EXTRACT_ACCESS
// permission is not granted, when the underlying document or the page
// dictionary is unavailable, or when the page number is out of range.
FX_BOOL Document::getPageNumWords(IFXJS_Context* cc, const CJS_Parameters& params, CJS_Value& vRet, CFX_WideString& sError)
{
    ASSERT(m_pDocument != NULL);

    if (!m_pDocument->GetPermissions(FPDFPERM_EXTRACT_ACCESS))
        return FALSE;

    int nPageNo = params.GetSize() > 0 ? params[0].ToInt() : 0;

    // Checked at run time (as getPageNthWord does) rather than ASSERTed,
    // so a missing document cannot be dereferenced in release builds.
    CPDF_Document* pDocument = m_pDocument->GetDocument();
    if (!pDocument)
        return FALSE;

    CJS_Context* pContext = static_cast<CJS_Context*>(cc);
    if (nPageNo < 0 || nPageNo >= pDocument->GetPageCount()) {
        sError = JSGetStringFromID(pContext, IDS_STRING_JSVALUEERROR);
        return FALSE;
    }

    CPDF_Dictionary* pPageDict = pDocument->GetPage(nPageNo);
    if (!pPageDict)
        return FALSE;

    CPDF_Page page;
    page.Load(pDocument, pPageDict);
    page.StartParse();
    page.ParseContent();

    int nWords = 0;
    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pPageObj = page.GetNextObject(pos);
        // Only text objects contribute words.
        if (!pPageObj || pPageObj->m_Type != PDFPAGE_TEXT)
            continue;
        nWords += CountWords(static_cast<CPDF_TextObject*>(pPageObj));
    }

    vRet = nWords;
    return TRUE;
}
// Detects pages whose text is predominantly rotated by a multiple of 90
// degrees and, if so, transforms the page and its bounding box.
//
// Every text object is counted. Those whose baseline angle rounds to exactly
// 90, 180 or 270 degrees are tallied per angle. When one of the three tallies
// exceeds two thirds of all text objects, the matching matrix is applied to
// both |page| and |page_bbox| (presumably to bring the text upright - TODO
// confirm the rotation direction against GetBaselineAngle()'s convention).
// Otherwise nothing is changed.
//
//   page      - parsed page to inspect; transformed in place on a match.
//   page_bbox - page bounding box; transformed with the same matrix.
static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
{
    int total_count = 0;
    // Tallies of text objects at 90 ([0]), 180 ([1]) and 270 ([2]) degrees.
    int rotated_count[3] = {0, 0, 0};

    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pObj = page.GetNextObject(pos);
        if (!pObj || pObj->m_Type != PDFPAGE_TEXT) {
            continue;
        }
        total_count ++;

        CPDF_TextObject* pText = static_cast<CPDF_TextObject*>(pObj);
        FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
        if (angle == 0.0) {
            continue;
        }

        // Round to the nearest whole degree. Adding 0.5 and truncating only
        // rounds correctly for positive values (-90.0 would become -89 and be
        // discarded below), so round away from zero on both sides.
        const double scaled = angle * 180 / PI;
        int degree = (int)(scaled < 0 ? scaled - 0.5 : scaled + 0.5);
        if (degree % 90) {
            continue;
        }

        // Normalize into [0, 360) so that, e.g., -90 is treated as 270 and a
        // full turn is treated as no rotation.
        degree %= 360;
        if (degree < 0) {
            degree += 360;
        }

        // 0 -> -1 (not rotated, skipped), 90 -> 0, 180 -> 1, 270 -> 2.
        // The former "degree / 90 % 3 - 1" mapped 270 to -1, so the 270-degree
        // tally was never incremented and its branch below was unreachable.
        int index = degree / 90 - 1;
        if (index < 0) {
            continue;
        }
        rotated_count[index] ++;
    }

    if (total_count == 0) {
        return;
    }

    CFX_AffineMatrix matrix;
    if (rotated_count[0] > total_count * 2 / 3) {
        matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
    } else if (rotated_count[1] > total_count * 2 / 3) {
        matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
    } else if (rotated_count[2] > total_count * 2 / 3) {
        matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
    } else {
        // No single orientation dominates; leave the page untouched.
        return;
    }

    page.Transform(matrix);
    page_bbox.Transform(&matrix);
}
// Returns text taken from the beginning of a page.
//
// The page is parsed with the text-only option set, and its text objects are
// fed in order to a CPDF_TextStream until ProcessObject() reports a non-zero
// result (presumably meaning the first line is complete - TODO confirm
// against CPDF_TextStream). Whatever the stream wrote into the buffer by then
// is returned.
//
//   pDoc  - document that owns the page.
//   pPage - page dictionary.
CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
{
    CFX_WideTextBuf buffer;
    buffer.EstimateSize(0, 1024);

    CPDF_Page page;
    page.Load(pDoc, pPage);
    CPDF_ParseOptions options;
    options.m_bTextOnly = TRUE;
    options.m_bSeparateForm = FALSE;
    page.ParseContent(&options);

    CPDF_TextStream textstream(buffer, FALSE, NULL);
    for (FX_POSITION cursor = page.GetFirstObjectPosition(); cursor; ) {
        CPDF_PageObject* pCurrent = page.GetNextObject(cursor);
        // Non-text objects are skipped.
        if (pCurrent->m_Type == PDFPAGE_TEXT &&
            textstream.ProcessObject((CPDF_TextObject*)pCurrent, TRUE)) {
            break;
        }
    }

    return buffer.GetWideString();
}
// JavaScript method: fetches a single word from a page.
//
//   cc     - script context; used to look up the error string.
//   params - params[0]: page number (default 0),
//            params[1]: word number (default 0),
//            params[2]: whether to trim the result on both sides
//                       (default true).
//   vRet   - receives the word string; empty when no text object matched.
//   sError - set to the IDS_STRING_JSVALUEERROR string when the page number
//            is out of range.
//
// Returns TRUE on success. Returns FALSE when the FPDFPERM_EXTRACT_ACCESS
// permission is not granted, when the underlying document or the page
// dictionary is unavailable, or when the page number is out of range.
FX_BOOL Document::getPageNthWord(IFXJS_Context* cc, const CJS_Parameters& params, CJS_Value& vRet, CFX_WideString& sError)
{
    ASSERT(m_pDocument != NULL);

    if (!m_pDocument->GetPermissions(FPDFPERM_EXTRACT_ACCESS))
        return FALSE;

    const int nArgs = params.GetSize();
    int nPageNo = nArgs > 0 ? params[0].ToInt() : 0;
    int nWordNo = nArgs > 1 ? params[1].ToInt() : 0;
    bool bStrip = nArgs > 2 ? params[2].ToBool() : true;

    CPDF_Document* pDocument = m_pDocument->GetDocument();
    if (!pDocument)
        return FALSE;

    CJS_Context* pContext = static_cast<CJS_Context*>(cc);
    const bool bPageInRange = nPageNo >= 0 && nPageNo < pDocument->GetPageCount();
    if (!bPageInRange) {
        sError = JSGetStringFromID(pContext, IDS_STRING_JSVALUEERROR);
        return FALSE;
    }

    CPDF_Dictionary* pPageDict = pDocument->GetPage(nPageNo);
    if (!pPageDict)
        return FALSE;

    CPDF_Page page;
    page.Load(pDocument, pPageDict);
    page.StartParse();
    page.ParseContent();

    // Walk the text objects, keeping a running word total, until the object
    // that covers the requested word number is reached.
    CFX_WideString swRet;
    int nWordsSeen = 0;
    for (FX_POSITION cursor = page.GetFirstObjectPosition(); cursor; ) {
        CPDF_PageObject* pPageObj = page.GetNextObject(cursor);
        if (!pPageObj || pPageObj->m_Type != PDFPAGE_TEXT)
            continue;

        CPDF_TextObject* pTextObj = (CPDF_TextObject*)pPageObj;
        const int nObjWords = CountWords(pTextObj);
        // NOTE(review): ">=" also matches when nWordNo equals the running
        // total, in which case the index handed to GetObjWordStr() equals
        // nObjWords - confirm this against GetObjWordStr()'s indexing
        // convention.
        if (nWordsSeen + nObjWords >= nWordNo) {
            swRet = GetObjWordStr(pTextObj, nWordNo - nWordsSeen);
            break;
        }
        nWordsSeen += nObjWords;
    }

    if (bStrip) {
        swRet.TrimLeft();
        swRet.TrimRight();
    }

    vRet = swRet.c_str();
    return TRUE;
}