Exemple #1
0
JNIEXPORT void JNICALL Java_com_foolabs_xpdf_PDFPage__1getText
	  (JNIEnv *env, jobject obj, jobject document, jobject javaCollector,
			  jboolean physicalLayout, jdouble fixedPitch, jboolean rawOrder) {
	Page *page = getHandle<Page>(env, obj);
	PDFDoc *doc = getHandle<PDFDoc>(env, document);

	TextCollector *collector = new TextCollector(env, javaCollector);

	GBool gPhysicalLayout = physicalLayout ? gTrue : gFalse;
	GBool gRawOrder = rawOrder ? gTrue : gFalse;

	TextOutputDev *outputDevice =
			new TextOutputDev(&TextCollector::CollectText, collector, gPhysicalLayout, gRawOrder);

	if (outputDevice->isOk()) {
		const double hDPI = 72;
		const double vDPI = 72;
		const int rotate = 0;
		const GBool useMediaBox = gFalse;
		const GBool crop = gTrue;
		const GBool printing = gFalse;
		Catalog *catalog = doc->getCatalog();

		page->display(outputDevice, hDPI, vDPI, rotate, useMediaBox, crop,
				printing, catalog);
	}

	delete collector; // All text should already be in Java
	delete outputDevice;
}
void XojPopplerPage::initTextPage()
{
    XOJ_CHECK_TYPE(XojPopplerPage);

    g_mutex_lock(&this->renderMutex);

    if (this->text == NULL)
    {
        g_mutex_lock(this->docMutex);
        TextOutputDev *textDev = new TextOutputDev(NULL, true, /* TODO POPPLER : Check value */ 0, false, false);

        Gfx *gfx = this->page->createGfx(textDev, 72.0, 72.0, 0, false, /* useMediaBox */ true, /* Crop */
                                         -1, -1, -1, -1, false, /* printing */ NULL, NULL);

        this->page->display(gfx);
        textDev->endPage();

        this->text = textDev->takeText();
        delete gfx;
        delete textDev;
        g_mutex_unlock(this->docMutex);
    }

    g_mutex_unlock(&this->renderMutex);
}
void NSRPopplerDocument::renderPage(int page)
{
	double dpix, dpiy;

	if (_doc == NULL || page > getNumberOfPages() || page < 1)
		return;

	_page = _catalog->getPage(page);

	if (isTextOnly()) {
		PDFRectangle	*rect;
		GooString	*text;
		TextOutputDev	*dev;

		dev = new TextOutputDev (0, gFalse, gFalse, gFalse);

		_doc->displayPageSlice(dev, _page->getNum(), 72, 72, 0, gFalse, gTrue, gFalse, -1, -1, -1, -1);

		rect = _page->getCropBox();
		text = dev->getText(rect->x1, rect->y1, rect->x2, rect->y2);
		_text = processText(QString::fromUtf8(text->getCString()));

		delete text;
		delete dev;
		_readyForLoad = true;

		return;
	}

	if (isZoomToWidth()) {
		double wZoom = ((double) getScreenWidth() / (double) _page->getCropWidth() * 100.0);
		setZoomSilent((int) wZoom);
	}

	if (getZoom() > getMaxZoom())
		setZoomSilent (getMaxZoom());
	else if (getZoom() < getMinZoom())
		setZoomSilent (getMinZoom());

	if (_readyForLoad)
		_dev->startPage(0, NULL);

	dpix = _dpix * getZoom() / 100.0;
	dpiy = _dpiy * getZoom() / 100.0;

	_page->display(_dev, dpix, dpiy, getRotation(), gFalse, gFalse, gTrue, _catalog);

	_readyForLoad = true;
}
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );
  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
	  double hDPI=72.0;
	  double vDPI=72.0;
	  int rotate=0;
	  GBool useMediaBox=gFalse;
	  GBool crop=gTrue; 
	  GBool printing=gFalse; 
	  if(doc->readMetadata()!=NULL)
	  {
		  GString rawMetaData = doc->readMetadata();
		  GString preparedMetaData="";

		  //zoek <rdf:RDF  en eindig bij </rdf:RDF>!! 
		  for(int x=0; x<rawMetaData.getLength(); x++) {
			  if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
				  //skip characters which the XMLReader doesn't understand
				  preparedMetaData.append(rawMetaData.getChar(x));
			  }
		  }
		  std::string metaData(preparedMetaData.getCString());
		  int startbegin = metaData.find("<rdf");
		  int stopend = metaData.find(">", metaData.rfind("</rdf") );
		  metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
	  

     	  indri::xml::XMLReader reader;

		  try {
			  std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
			  appendPdfMetaData( result.get() );
		  } catch( lemur::api::Exception& e ) {
			LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
		  } 
		  if( _author.length()>0 || _title.length()>0 )
		  {
			std::string createdPdfHeader;
			createdPdfHeader="<head>\n";
			if(_title.length()>0) {
				createdPdfHeader+="<title>";
				createdPdfHeader+=_title;
				createdPdfHeader+="</title>\n";
			}
			if(_author.length()>0) {
				createdPdfHeader+="<author>";
				createdPdfHeader+=_author;
				createdPdfHeader+="</author>\n";
			}
			createdPdfHeader+="</head>\n";
			char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
			strcpy(metastream, createdPdfHeader.c_str());
		  }
	  }
      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }
  

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  _documentPath = "";

  return &_unparsedDocument;
}