/**
 * JNI implementation of com.foolabs.xpdf.PDFPage._getText.
 *
 * Renders the page behind `obj` through an xpdf TextOutputDev whose output
 * callback is TextCollector::CollectText, so extracted text is handed to the
 * Java-side collector object while the page is being displayed.
 *
 * @param env            JNI environment of the calling thread.
 * @param obj            Java PDFPage object; its native handle is a Page.
 * @param document       Java document object; its native handle is a PDFDoc.
 * @param javaCollector  Java object wrapped by TextCollector to receive text.
 * @param physicalLayout non-zero to request physical-layout mode.
 * @param fixedPitch     accepted for interface compatibility; not used here
 *                       because the TextOutputDev constructor called below
 *                       takes no pitch argument.
 * @param rawOrder       non-zero to request raw content-stream order.
 */
JNIEXPORT void JNICALL Java_com_foolabs_xpdf_PDFPage__1getText
  (JNIEnv *env, jobject obj, jobject document, jobject javaCollector,
   jboolean physicalLayout, jdouble fixedPitch, jboolean rawOrder)
{
    static_cast<void>(fixedPitch);

    Page *page = getHandle<Page>(env, obj);
    PDFDoc *doc = getHandle<PDFDoc>(env, document);
    // NOTE(review): getHandle is defined elsewhere; guard in case it can yield
    // a null handle, since both pointers are dereferenced below.
    if (page == 0 || doc == 0) {
        return;
    }

    const GBool gPhysicalLayout = physicalLayout ? gTrue : gFalse;
    const GBool gRawOrder = rawOrder ? gTrue : gFalse;

    // Automatic storage: the output device is declared after the collector, so
    // it is destroyed first and never outlives the collector it points to.
    // Both objects are released on every path out of this function.
    TextCollector collector(env, javaCollector);
    TextOutputDev outputDevice(&TextCollector::CollectText, &collector,
                               gPhysicalLayout, gRawOrder);

    if (outputDevice.isOk()) {
        const double hDPI = 72;
        const double vDPI = 72;
        const int rotate = 0;
        const GBool useMediaBox = gFalse;
        const GBool crop = gTrue;
        const GBool printing = gFalse;
        Catalog *catalog = doc->getCatalog();

        page->display(&outputDevice, hDPI, vDPI, rotate,
                      useMediaBox, crop, printing, catalog);
    }
}
/**
 * Extracts the text of the PDF at _documentPath into _documentTextBuffer and
 * returns it as an UnparsedDocument.
 *
 * When the PDF carries a metadata stream containing an RDF block, that block
 * is parsed and passed to appendPdfMetaData(); if _title and/or _author are
 * non-empty afterwards, a <head> section with them is written to the text
 * buffer ahead of the page text.
 *
 * If the PDF cannot be opened or does not permit copying, the returned
 * document has empty text. The "path" and "docno" metadata pairs are always
 * attached, and _documentPath is cleared so the next call returns 0.
 *
 * @return pointer to the internal _unparsedDocument, or 0 when no document
 *         path is pending.
 * @throws lemur::api::Exception (via LEMUR_RETHROW) when the RDF metadata
 *         cannot be read by the XML reader.
 */
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );

  // if the doc is not ok, or not ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse );

    if( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI = 72.0;
      double vDPI = 72.0;
      int rotate = 0;
      GBool useMediaBox = gFalse;
      GBool crop = gTrue;
      GBool printing = gFalse;

      // Read the metadata exactly once. The returned GString is heap
      // allocated and owned by the caller, so it must be deleted here.
      GString* rawMetaData = doc->readMetadata();

      if( rawMetaData != NULL ) {
        GString preparedMetaData;

        // look for <rdf:RDF and end at </rdf:RDF>
        for( int x = 0; x < rawMetaData->getLength(); x++ ) {
          const char c = rawMetaData->getChar(x);
          // skip characters which the XMLReader doesn't understand
          if( c != '?' && c != ':' ) {
            preparedMetaData.append( c );
          }
        }

        delete rawMetaData;
        rawMetaData = 0;

        std::string metaData( preparedMetaData.getCString() );
        const std::string::size_type rdfStart = metaData.find( "<rdf" );

        // Without an RDF block there is nothing to parse; calling substr()
        // with npos as the start would throw std::out_of_range.
        if( rdfStart != std::string::npos ) {
          const std::string::size_type rdfEnd =
            metaData.find( ">", metaData.rfind( "</rdf" ) );
          // If no usable closing tag is found, keep everything from the
          // opening tag to the end of the string.
          const std::string::size_type rdfLength =
            ( rdfEnd == std::string::npos || rdfEnd < rdfStart )
              ? std::string::npos
              : ( rdfEnd - rdfStart ) + 1;
          metaData = metaData.substr( rdfStart, rdfLength );

          indri::xml::XMLReader reader;
          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // Release the xpdf objects before propagating, otherwise they leak.
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader = "<head>\n";
            if( _title.length()>0 ) {
              createdPdfHeader += "<title>";
              createdPdfHeader += _title;
              createdPdfHeader += "</title>\n";
            }
            if( _author.length()>0 ) {
              createdPdfHeader += "<author>";
              createdPdfHeader += _author;
              createdPdfHeader += "</author>\n";
            }
            createdPdfHeader += "</head>\n";

            // Reserve room for the header plus its terminating NUL and copy
            // it into the text buffer ahead of the page text.
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy( metastream, createdPdfHeader.c_str() );
          }
        }
      }

      doc->displayPages( textOut, firstPage, lastPage, hDPI, vDPI, rotate,
                         useMediaBox, crop, printing );
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  // contentLength excludes the trailing NUL; it is 0 if the text is empty.
  _unparsedDocument.contentLength =
    _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0;

  // The document path is stored in the same buffer, after the text.
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );

  // Take the front pointer only after the last write to the buffer.
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign( _documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  // Mark the path as consumed so the next call reports no more documents.
  _documentPath = "";

  return &_unparsedDocument;
}