Beispiel #1
0
// Collects the text of a single page into `lines`.
//
//   lines     - output array; always emptied first, even when pPage is NULL.
//   pDoc      - document the page is loaded from.
//   pPage     - page dictionary; when NULL the function returns with `lines` empty.
//   iMinWidth - forwarded unchanged to CTextPage::WriteOutput.
//   flags     - PDF2TXT_* bit mask:
//                 PDF2TXT_AUTO_ROTATE       runs CheckRotate on the page first,
//                 PDF2TXT_AUTO_WIDTH        copied into CTextPage::m_bAutoWidth,
//                 PDF2TXT_KEEP_COLUMN       copied into CTextPage::m_bKeepColumn,
//                 PDF2TXT_INCLUDE_INVISIBLE disables the page-box filter below.
void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CPDF_Dictionary* pPage,
                             int iMinWidth, FX_DWORD flags)
{
    lines.RemoveAll();
    if (!pPage) {
        return;
    }

    // Parse the page content with the text-only option set.
    CPDF_Page page;
    page.Load(pDoc, pPage);
    CPDF_ParseOptions parse_opts;
    parse_opts.m_bTextOnly = TRUE;
    parse_opts.m_bSeparateForm = FALSE;
    page.ParseContent(&parse_opts);

    // CheckRotate may transform both the page and this box.
    CFX_FloatRect page_bbox = page.GetPageBBox();
    if (flags & PDF2TXT_AUTO_ROTATE) {
        CheckRotate(page, page_bbox);
    }

    CTextPage collector;
    collector.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
    collector.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
    collector.m_bBreakSpace = TRUE;

    // Unless invisible objects were requested, only objects whose rectangle
    // is contained in the page box are handed to the collector.
    const bool filter_by_bbox = (flags & PDF2TXT_INCLUDE_INVISIBLE) == 0;
    for (FX_POSITION cursor = page.GetFirstObjectPosition(); cursor; ) {
        CPDF_PageObject* pObj = page.GetNextObject(cursor);
        if (filter_by_bbox) {
            CFX_FloatRect obj_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, pObj->m_Top);
            if (!page_bbox.Contains(obj_rect)) {
                continue;
            }
        }
        collector.ProcessObject(pObj);
    }

    collector.WriteOutput(lines, iMinWidth);
}
Beispiel #2
0
// Appends to *pRectArray the bounding rectangle of every page object that
// IsValiableRect accepts when tested against the page's "MediaBox" entry.
//
//   pDoc       - document used to load the page.
//   pDict      - page dictionary; also the source of the "MediaBox" rectangle.
//   pRectArray - receives the accepted rectangles (existing entries are kept).
//
// Returns FALSE when pDict or pRectArray is NULL, TRUE otherwise.
FX_BOOL GetContentsRect( CPDF_Document * pDoc, CPDF_Dictionary* pDict, CPDF_RectArray * pRectArray )
{
	// Both pointers are dereferenced below; bail out instead of crashing.
	if (!pDict || !pRectArray)
		return FALSE;

	// The page lives on the stack: no unchecked allocation result, and it is
	// released on every exit path.
	CPDF_Page page;
	page.Load( pDoc, pDict, FALSE );
	page.ParseContent();

	// The MediaBox does not change while iterating, so read it only once.
	CPDF_Rect rcMediaBox = pDict->GetRect("MediaBox");

	FX_POSITION pos = page.GetFirstObjectPosition();
	while (pos)
	{
		CPDF_PageObject* pPageObject = page.GetNextObject(pos);
		if (!pPageObject)
			continue;

		CPDF_Rect rc;
		rc.left = pPageObject->m_Left;
		rc.right = pPageObject->m_Right;
		rc.bottom = pPageObject->m_Bottom;
		rc.top = pPageObject->m_Top;

		if (IsValiableRect(rc, rcMediaBox))
		{
			pRectArray->Add(rc);
		}
	}

	return TRUE;
}
Beispiel #3
0
// JavaScript binding: stores in vRet the number of words on one page.
//
//   cc     - script context; used only to look up the error string.
//   params - params[0] (optional) is the page index, default 0. It must lie
//            in [0, page count).
//   vRet   - receives the sum of CountWords() over all text objects of the page.
//   sError - set to the IDS_STRING_JSVALUEERROR string when the page index is
//            out of range.
//
// Returns FALSE when the document lacks FPDFPERM_EXTRACT_ACCESS, when the
// underlying CPDF_Document or the page dictionary is unavailable, or when the
// page index is out of range; TRUE otherwise.
FX_BOOL Document::getPageNumWords(IFXJS_Context* cc, const CJS_Parameters& params, CJS_Value& vRet, CFX_WideString& sError)
{
	ASSERT(m_pDocument != NULL);

	if (!m_pDocument->GetPermissions(FPDFPERM_EXTRACT_ACCESS)) return FALSE;

	int nPageNo = params.GetSize() > 0 ? params[0].ToInt() : 0;

	// Runtime check (not just an ASSERT) so builds where ASSERT compiles to
	// nothing do not dereference a null document below.
	CPDF_Document* pDocument = m_pDocument->GetDocument();
	if (!pDocument) return FALSE;

	CJS_Context* pContext = static_cast<CJS_Context*>(cc);
	if (nPageNo < 0 || nPageNo >= pDocument->GetPageCount())
	{
		sError = JSGetStringFromID(pContext, IDS_STRING_JSVALUEERROR);
		return FALSE;
	}

	CPDF_Dictionary* pPageDict = pDocument->GetPage(nPageNo);
	if (!pPageDict) return FALSE;

	CPDF_Page page;
	page.Load(pDocument, pPageDict);
	page.StartParse();
	page.ParseContent();

	int nWords = 0;

	FX_POSITION pos = page.GetFirstObjectPosition();
	while (pos)
	{
		CPDF_PageObject* pPageObj = page.GetNextObject(pos);

		// Only text objects contribute to the word count.
		if (!pPageObj || pPageObj->m_Type != PDFPAGE_TEXT)
			continue;

		nWords += CountWords((CPDF_TextObject*)pPageObj);
	}

	vRet = nWords;

	return TRUE;
}
Beispiel #4
0
// Detects whether most of the text on `page` is rotated by 90, 180 or 270
// degrees and, if so, transforms both the page and `page_bbox` with a fixed
// matrix chosen for that rotation.
//
// For every text object the baseline angle (radians) is converted to whole
// degrees. Angles that are zero or not a multiple of 90 degrees are ignored.
// A transform is applied only when one rotation accounts for more than
// total_count * 2 / 3 (integer arithmetic) of all text objects; otherwise
// the page and box are left untouched.
static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox)
{
    // rotated_count[0] = 90 degrees, [1] = 180 degrees, [2] = 270 degrees.
    int total_count = 0, rotated_count[3] = {0, 0, 0};
    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pObj = page.GetNextObject(pos);
        if (pObj == NULL || pObj->m_Type != PDFPAGE_TEXT) {
            continue;
        }
        total_count ++;
        CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
        FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
        if (angle == 0.0) {
            continue;
        }
        // Round half away from zero. Adding +0.5 unconditionally and then
        // truncating would turn e.g. -90 degrees into -89 and drop it.
        int degree = (int)(angle * 180 / PI + (angle < 0 ? -0.5 : 0.5));
        if (degree % 90) {
            continue;
        }
        if (degree < 0) {
            degree += 360;
        }
        // Quadrant 0..3 minus one: 0/360 -> -1 (skipped), 90 -> 0, 180 -> 1,
        // 270 -> 2. Taking the remainder by 3 here would map 270 to -1 and
        // leave rotated_count[2] permanently zero.
        int index = degree / 90 % 4 - 1;
        if (index < 0) {
            continue;
        }
        rotated_count[index] ++;
    }
    if (total_count == 0) {
        return;
    }
    CFX_AffineMatrix matrix;
    if (rotated_count[0] > total_count * 2 / 3) {
        matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
    } else if (rotated_count[1] > total_count * 2 / 3) {
        matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
    } else if (rotated_count[2] > total_count * 2 / 3) {
        matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
    } else {
        // No dominant rotation: leave the page as it is.
        return;
    }
    page.Transform(matrix);
    page_bbox.Transform(&matrix);
}
Beispiel #5
0
// Returns text gathered from the text objects of one page, stopping at the
// first object for which CPDF_TextStream::ProcessObject reports a non-zero
// result.
// NOTE(review): going by the function name, that result presumably signals
// that the first line is complete - confirm against CPDF_TextStream.
//
//   pDoc  - document the page is loaded from.
//   pPage - page dictionary; a NULL page yields an empty string, matching the
//           guard in PDF_GetPageText_Unicode.
CFX_WideString PDF_GetFirstTextLine_Unicode(CPDF_Document* pDoc, CPDF_Dictionary* pPage)
{
    if (pPage == NULL) {
        return CFX_WideString();
    }
    CFX_WideTextBuf buffer;
    buffer.EstimateSize(0, 1024);
    CPDF_Page page;
    page.Load(pDoc, pPage);
    CPDF_ParseOptions options;
    options.m_bTextOnly = TRUE;
    options.m_bSeparateForm = FALSE;
    page.ParseContent(&options);
    CPDF_TextStream textstream(buffer, FALSE, NULL);
    FX_POSITION pos = page.GetFirstObjectPosition();
    while (pos) {
        CPDF_PageObject* pObject = page.GetNextObject(pos);
        // Skip missing entries and anything that is not a text object.
        if (pObject == NULL || pObject->m_Type != PDFPAGE_TEXT) {
            continue;
        }
        if (textstream.ProcessObject((CPDF_TextObject*)pObject, TRUE)) {
            break;
        }
    }
    return buffer.GetWideString();
}
Beispiel #6
0
// JavaScript binding: stores in vRet one word taken from a page.
//
//   cc     - script context; used only to look up the error string.
//   params - params[0] page index (default 0), must lie in [0, page count);
//            params[1] word number (default 0);
//            params[2] strip flag (default true): trims the result on both
//            sides via TrimLeft/TrimRight.
//   vRet   - receives the word, or an empty string when no text object
//            satisfied the selection test below.
//   sError - set to the IDS_STRING_JSVALUEERROR string when the page index is
//            out of range.
//
// Returns FALSE when FPDFPERM_EXTRACT_ACCESS is missing, when the document or
// page dictionary is unavailable, or when the page index is out of range.
FX_BOOL Document::getPageNthWord(IFXJS_Context* cc, const CJS_Parameters& params, CJS_Value& vRet, CFX_WideString& sError)
{
	ASSERT(m_pDocument != NULL);

	if (!m_pDocument->GetPermissions(FPDFPERM_EXTRACT_ACCESS))
		return FALSE;

	// Optional arguments fall back to their defaults when absent.
	int nPageNo = 0;
	if (params.GetSize() > 0)
		nPageNo = params[0].ToInt();

	int nWordNo = 0;
	if (params.GetSize() > 1)
		nWordNo = params[1].ToInt();

	bool bStrip = true;
	if (params.GetSize() > 2)
		bStrip = params[2].ToBool();

	CPDF_Document* pDocument = m_pDocument->GetDocument();
	if (!pDocument)
		return FALSE;

	CJS_Context* pContext = static_cast<CJS_Context*>(cc);
	const bool bPageInRange = nPageNo >= 0 && nPageNo < pDocument->GetPageCount();
	if (!bPageInRange)
	{
		sError = JSGetStringFromID(pContext, IDS_STRING_JSVALUEERROR);
		return FALSE;
	}

	CPDF_Dictionary* pPageDict = pDocument->GetPage(nPageNo);
	if (!pPageDict)
		return FALSE;

	CPDF_Page page;
	page.Load(pDocument, pPageDict);
	page.StartParse();
	page.ParseContent();

	int nWordsBefore = 0;	// words in the text objects already passed
	CFX_WideString swWord;

	for (FX_POSITION pos = page.GetFirstObjectPosition(); pos; )
	{
		CPDF_PageObject* pObj = page.GetNextObject(pos);
		if (!pObj || pObj->m_Type != PDFPAGE_TEXT)
			continue;

		CPDF_TextObject* pTextObj = (CPDF_TextObject*)pObj;
		int nObjWords = CountWords(pTextObj);

		// NOTE(review): with ">=" a word number equal to the running total is
		// looked up in this object at index nObjWords; whether that is
		// intended depends on GetObjWordStr's indexing - confirm.
		if (nWordsBefore + nObjWords >= nWordNo)
		{
			swWord = GetObjWordStr(pTextObj, nWordNo - nWordsBefore);
			break;
		}

		nWordsBefore += nObjWords;
	}

	if (bStrip)
	{
		swWord.TrimLeft();
		swWord.TrimRight();
	}

	vRet = swWord.c_str();
	return TRUE;
}