/**
 * JNI entry point for the native method `_getText` of com.foolabs.xpdf.PDFPage.
 *
 * Renders the page through a TextOutputDev whose output callback is
 * TextCollector::CollectText, so the extracted text is handed to
 * `javaCollector` while the page is being displayed.
 *
 * @param env            JNI environment of the calling thread.
 * @param obj            Java PDFPage object; its native Page is looked up with getHandle.
 * @param document       Java document object; its native PDFDoc is looked up with getHandle.
 * @param javaCollector  Java object wrapped by the TextCollector.
 * @param physicalLayout Forwarded to TextOutputDev as the physical-layout flag.
 * @param fixedPitch     Accepted for interface compatibility; not used by this function.
 * @param rawOrder       Forwarded to TextOutputDev as the raw-order flag.
 *
 * If either native handle is null, an IllegalStateException is raised in Java
 * (unless an exception is already pending) and nothing is rendered.
 */
JNIEXPORT void JNICALL Java_com_foolabs_xpdf_PDFPage__1getText
  (JNIEnv *env, jobject obj, jobject document, jobject javaCollector,
   jboolean physicalLayout, jdouble fixedPitch, jboolean rawOrder)
{
    (void) fixedPitch; // not forwarded: the TextOutputDev constructor used below takes no pitch

    Page *page = getHandle<Page>(env, obj);
    PDFDoc *doc = getHandle<PDFDoc>(env, document);

    // Dereferencing a null native handle would take the whole JVM down,
    // so report the problem to the Java side instead.
    if (page == NULL || doc == NULL) {
        if (!env->ExceptionCheck()) {
            jclass errorClass = env->FindClass("java/lang/IllegalStateException");
            if (errorClass != NULL) {
                env->ThrowNew(errorClass, "PDFPage or PDFDoc native handle is null");
            }
        }
        return;
    }

    const GBool gPhysicalLayout = physicalLayout ? gTrue : gFalse;
    const GBool gRawOrder = rawOrder ? gTrue : gFalse;

    // Automatic storage: the output device is declared after the collector it
    // points to, so it is destroyed first and never outlives the collector.
    TextCollector collector(env, javaCollector);
    TextOutputDev outputDevice(&TextCollector::CollectText, &collector,
                               gPhysicalLayout, gRawOrder);

    if (outputDevice.isOk()) {
        const double hDPI = 72;
        const double vDPI = 72;
        const int rotate = 0;
        const GBool useMediaBox = gFalse;
        const GBool crop = gTrue;
        const GBool printing = gFalse;
        Catalog *catalog = doc->getCatalog();

        page->display(&outputDevice, hDPI, vDPI, rotate,
                      useMediaBox, crop, printing, catalog);
    }
}
void XojPopplerPage::initTextPage() { XOJ_CHECK_TYPE(XojPopplerPage); g_mutex_lock(&this->renderMutex); if (this->text == NULL) { g_mutex_lock(this->docMutex); TextOutputDev *textDev = new TextOutputDev(NULL, true, /* TODO POPPLER : Check value */ 0, false, false); Gfx *gfx = this->page->createGfx(textDev, 72.0, 72.0, 0, false, /* useMediaBox */ true, /* Crop */ -1, -1, -1, -1, false, /* printing */ NULL, NULL); this->page->display(gfx); textDev->endPage(); this->text = textDev->takeText(); delete gfx; delete textDev; g_mutex_unlock(this->docMutex); } g_mutex_unlock(&this->renderMutex); }
void NSRPopplerDocument::renderPage(int page) { double dpix, dpiy; if (_doc == NULL || page > getNumberOfPages() || page < 1) return; _page = _catalog->getPage(page); if (isTextOnly()) { PDFRectangle *rect; GooString *text; TextOutputDev *dev; dev = new TextOutputDev (0, gFalse, gFalse, gFalse); _doc->displayPageSlice(dev, _page->getNum(), 72, 72, 0, gFalse, gTrue, gFalse, -1, -1, -1, -1); rect = _page->getCropBox(); text = dev->getText(rect->x1, rect->y1, rect->x2, rect->y2); _text = processText(QString::fromUtf8(text->getCString())); delete text; delete dev; _readyForLoad = true; return; } if (isZoomToWidth()) { double wZoom = ((double) getScreenWidth() / (double) _page->getCropWidth() * 100.0); setZoomSilent((int) wZoom); } if (getZoom() > getMaxZoom()) setZoomSilent (getMaxZoom()); else if (getZoom() < getMinZoom()) setZoomSilent (getMinZoom()); if (_readyForLoad) _dev->startPage(0, NULL); dpix = _dpix * getZoom() / 100.0; dpiy = _dpiy * getZoom() / 100.0; _page->display(_dev, dpix, dpiy, getRotation(), gFalse, gFalse, gTrue, _catalog); _readyForLoad = true; }
/**
 * Extracts the PDF named by _documentPath into _documentTextBuffer and
 * describes the result through _unparsedDocument.
 *
 * Steps:
 *  - open the PDF; text is only extracted when the document is ok and
 *    copying is permitted;
 *  - if the PDF carries metadata containing an <rdf ...> ... </rdf ...> block,
 *    parse it (appendPdfMetaData) and, when a title and/or author is known,
 *    write a <head> section in front of the page text;
 *  - write the text of all pages through buffer_write;
 *  - append the document path to the buffer and publish it as the "path"
 *    metadata pair, plus a "docno" pair derived from it via cleanDocno().
 *
 * _documentPath is cleared before returning, so the next call returns 0
 * until a new path is set.
 *
 * @return 0 when no document path is pending, otherwise a pointer to the
 *         member _unparsedDocument.
 * @throws lemur::api::Exception (via LEMUR_RETHROW) when the metadata block
 *         cannot be read by the XML reader.
 */
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );

  // if the doc is not ok, or not ok to copy, no text
  // is extracted from it.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);

    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // readMetadata() hands back a heap-allocated string (or NULL); fetch it
      // once and release it as soon as its contents have been copied.
      GString* rawMetaData = doc->readMetadata();

      if( rawMetaData != NULL ) {
        GString preparedMetaData("");

        for( int x=0; x<rawMetaData->getLength(); x++ ) {
          const char c = rawMetaData->getChar(x);
          if( c!='?' && c!=':' ) { // skip characters which the XMLReader doesn't understand
            preparedMetaData.append( c );
          }
        }
        delete rawMetaData;

        std::string metaData( preparedMetaData.getCString() );

        // look for <rdf:RDF and end at </rdf:RDF> (the ':' has been stripped above)
        const std::string::size_type rdfBegin = metaData.find( "<rdf" );
        const std::string::size_type rdfClose = metaData.rfind( "</rdf" );
        const std::string::size_type rdfEnd =
          ( rdfClose == std::string::npos ) ? std::string::npos : metaData.find( ">", rdfClose );

        // Only parse when a complete rdf block is present; slicing with a
        // "not found" position would make substr() throw std::out_of_range.
        if( rdfBegin != std::string::npos &&
            rdfEnd != std::string::npos &&
            rdfBegin < rdfEnd ) {
          metaData = metaData.substr( rdfBegin, (rdfEnd-rdfBegin)+1 );

          indri::xml::XMLReader reader;

          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // release the PDF objects before the error leaves this function
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";

            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }

            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }

            createdPdfHeader+="</head>\n";

            // reserve room for the header plus its terminating null
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }

      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.

  // the path is stored in the same buffer, right behind the extracted text
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );

  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  _documentPath = "";

  return &_unparsedDocument;
}