/**
 * Converts the filter chain's input PDF file into an SVG output file.
 *
 * @param from source mime type; must be "application/pdf"
 * @param to   destination mime type; must be "image/svg+xml"
 * @return KoFilter::NotImplemented for any other mime type pair,
 *         KoFilter::StupidError when the PDF cannot be opened or the SVG
 *         output device cannot be set up, KoFilter::OK otherwise.
 */
KoFilter::ConversionStatus PdfImport::convert(const QByteArray& from, const QByteArray& to)
{
    debugPdf << "to:" << to << " from:" << from;

    if (from != "application/pdf" || to != "image/svg+xml") {
        return KoFilter::NotImplemented;
    }

    // read config file
    // (plain `new` throws on failure, so no null checks are needed below)
    globalParams = new GlobalParams();

    // NOTE(review): fname is never freed in this function; presumably PDFDoc
    // takes ownership of it -- confirm against the poppler version in use.
    GooString * fname = new GooString(QFile::encodeName(m_chain->inputFile()).data());
    PDFDoc * pdfDoc = new PDFDoc(fname, 0, 0, 0);
    if (! pdfDoc->isOk()) {
        // Release the document before the global parameters (same order as
        // the success path) and never leave the global pointer dangling.
        delete pdfDoc;
        delete globalParams;
        globalParams = 0;
        return KoFilter::StupidError;
    }

    const double hDPI = 72.0;
    const double vDPI = 72.0;

    const int firstPage = 1;
    const int lastPage = pdfDoc->getNumPages();

    debugPdf << "converting pages" << firstPage << "-" << lastPage;

    SvgOutputDev * dev = new SvgOutputDev(m_chain->outputFile());
    const bool deviceOk = dev->isOk();
    if (deviceOk) {
        int rotate = 0;
        GBool useMediaBox = gTrue;
        GBool crop = gFalse;
        GBool printing = gFalse;
        pdfDoc->displayPages(dev, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
        dev->dumpContent();

        debugPdf << "wrote file to" << m_chain->outputFile();
    }

    delete dev;
    delete pdfDoc;
    delete globalParams;
    globalParams = 0;

    // check for memory leaks
    Object::memCheck(stderr);

    // Without a usable output device nothing was written, so do not claim
    // success to the filter chain.
    return deviceOk ? KoFilter::OK : KoFilter::StupidError;
}
int main(int argc, char *argv[]) { // parse args bool ok = parseArgs(argDesc, &argc, argv); if (!ok || argc < 2 || argc > 3 || printHelp) { fprintf(stderr, "pdftoipe version %s\n", PDFTOIPE_VERSION); printUsage("pdftoipe", "<PDF-file> [<XML-file>]", argDesc); return 1; } GooString *fileName = new GooString(argv[1]); globalParams = new GlobalParams(); if (quiet) globalParams->setErrQuiet(quiet); GooString *ownerPW, *userPW; if (ownerPassword[0]) { ownerPW = new GooString(ownerPassword); } else { ownerPW = 0; } if (userPassword[0]) { userPW = new GooString(userPassword); } else { userPW = 0; } // open PDF file PDFDoc *doc = new PDFDoc(fileName, ownerPW, userPW); delete userPW; delete ownerPW; if (!doc->isOk()) return 1; // construct XML file name std::string xmlFileName; if (argc == 3) { xmlFileName = argv[2]; } else { const char *p = fileName->c_str() + fileName->getLength() - 4; if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) { xmlFileName = std::string(fileName->c_str(), fileName->getLength() - 4); } else { xmlFileName = fileName->c_str(); } xmlFileName += ".ipe"; } // get page range if (firstPage < 1) firstPage = 1; if (lastPage < 1 || lastPage > doc->getNumPages()) lastPage = doc->getNumPages(); // write XML file XmlOutputDev *xmlOut = new XmlOutputDev(xmlFileName, doc->getXRef(), doc->getCatalog(), firstPage, lastPage); // tell output device about text handling xmlOut->setTextHandling(math, notext, literal, mergeLevel, unicodeLevel); int exitCode = 2; if (xmlOut->isOk()) { doc->displayPages(xmlOut, firstPage, lastPage, // double hDPI, double vDPI, int rotate, // bool useMediaBox, bool crop, bool printing, 72.0, 72.0, 0, false, false, false); exitCode = 0; } if (xmlOut->hasUnicode()) { fprintf(stderr, "The document contains Unicode (non-ASCII) text.\n"); if (unicodeLevel <= 1) fprintf(stderr, "Unknown Unicode characters were replaced by [U+XXX].\n"); else fprintf(stderr, "UTF-8 was set as document encoding in the preamble.\n"); } // clean up delete xmlOut; delete 
doc; delete globalParams; return exitCode; }
// Copies the span from the first "<rdf" up to the '>' that closes the last
// "</rdf" tag of metaData into fragment. Returns false (leaving fragment
// untouched) when no such complete span exists.
static bool extractRdfFragment( const std::string& metaData, std::string& fragment ) {
  const std::string::size_type begin = metaData.find( "<rdf" );
  const std::string::size_type closeTag = metaData.rfind( "</rdf" );
  if( begin == std::string::npos || closeTag == std::string::npos )
    return false;

  const std::string::size_type end = metaData.find( ">", closeTag );
  if( end == std::string::npos || end < begin )
    return false;

  fragment = metaData.substr( begin, (end - begin) + 1 );
  return true;
}

//
// nextDocument
//
// Extracts the text of the PDF file named by _documentPath into
// _documentTextBuffer and returns it as an UnparsedDocument carrying "path"
// and "docno" metadata. Returns 0 when no document path is pending; the path
// is cleared on the way out, so each opened file yields one document.
// If the file cannot be read, or copying is not permitted, the resulting
// document has no extracted text.
// Throws (via LEMUR_RETHROW) when the PDF's RDF metadata cannot be parsed.
//
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  // NOTE(review): gfilename is never freed here; presumably PDFDoc takes
  // ownership of it -- confirm against the xpdf version in use.
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );

  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);

    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // xpdf's readMetadata() hands back a heap-allocated GString owned by
      // the caller, so fetch it exactly once and free it when done.
      GString* rawMetaData = doc->readMetadata();
      if( rawMetaData != NULL ) {
        // Skip characters which the XMLReader doesn't understand
        // ('?' and ':'), e.g. "<rdf:RDF" becomes "<rdfRDF".
        std::string metaData;
        const int rawLength = rawMetaData->getLength();
        metaData.reserve( rawLength );
        for( int x=0; x<rawLength; x++ ) {
          const char c = rawMetaData->getChar(x);
          if( c == '\0' )
            break; // the text is handled as a C string further down
          if( c != '?' && c != ':' )
            metaData += c;
        }
        delete rawMetaData;
        rawMetaData = 0;

        // Look for "<rdf:RDF" and stop at "</rdf:RDF>". Metadata without a
        // complete RDF element is skipped instead of being handed to
        // substr() with out-of-range positions.
        std::string rdfFragment;
        if( extractRdfFragment( metaData, rdfFragment ) ) {
          indri::xml::XMLReader reader;
          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( rdfFragment.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // Release the PDF objects before propagating the error.
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";
            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }
            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }
            createdPdfHeader+="</head>\n";

            // NOTE(review): this reserves and copies the terminating NUL as
            // well, ahead of the page text written by displayPages() --
            // confirm buffer_write copes with that.
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }

      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.

  // The path is appended after the text so it can serve as metadata storage.
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );

  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  _documentPath = "";

  return &_unparsedDocument;
}