Пример #1
0
/**
 * Converts the filter chain's PDF input file into an SVG output file.
 *
 * Only the conversion from "application/pdf" to "image/svg+xml" is handled;
 * any other mime type pair yields KoFilter::NotImplemented.
 *
 * @param from mime type of the input document
 * @param to   mime type of the requested output
 * @return KoFilter::OK when all pages were rendered and dumped,
 *         KoFilter::StupidError when the PDF could not be opened or the
 *         SVG output device could not be set up.
 */
KoFilter::ConversionStatus PdfImport::convert(const QByteArray& from, const QByteArray& to)
{
    debugPdf << "to:" << to << " from:" << from;

    if (from != "application/pdf" || to != "image/svg+xml") {
        return KoFilter::NotImplemented;
    }

    // read config file
    globalParams = new GlobalParams();
    if (! globalParams)
        return KoFilter::NotImplemented;

    // NOTE(review): fname is never deleted here; presumably PDFDoc takes
    // ownership of the file name it is given -- confirm against the Poppler
    // version in use.
    GooString * fname = new GooString(QFile::encodeName(m_chain->inputFile()).data());
    PDFDoc * pdfDoc = new PDFDoc(fname, 0, 0, 0);
    if (! pdfDoc || ! pdfDoc->isOk()) {
        // Release in the same order as the success path and reset the global
        // so that no dangling pointer is left behind for later users.
        delete pdfDoc;
        delete globalParams;
        globalParams = 0;
        return KoFilter::StupidError;
    }

    double hDPI = 72.0;
    double vDPI = 72.0;

    int firstPage = 1;
    int lastPage = pdfDoc->getNumPages();

    debugPdf << "converting pages" << firstPage << "-" << lastPage;

    SvgOutputDev * dev = new SvgOutputDev(m_chain->outputFile());
    const bool devOk = dev->isOk();
    if (devOk) {
        int rotate = 0;
        GBool useMediaBox = gTrue;
        GBool crop = gFalse;
        GBool printing = gFalse;
        pdfDoc->displayPages(dev, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
        dev->dumpContent();

        debugPdf << "wrote file to" << m_chain->outputFile();
    }

    delete dev;
    delete pdfDoc;
    delete globalParams;
    globalParams = 0;

    // check for memory leaks
    Object::memCheck(stderr);

    // A device that failed to initialise produced no output, so the
    // conversion must not be reported as successful.
    return devOk ? KoFilter::OK : KoFilter::StupidError;
}
Пример #2
0
/**
 * pdftoipe entry point: converts a PDF file into an Ipe XML file.
 *
 * Usage: pdftoipe <PDF-file> [<XML-file>]
 * When no XML file name is given, it is derived from the PDF file name by
 * replacing a trailing ".pdf"/".PDF" (or appending, if absent) with ".ipe".
 *
 * @return 0 on success, 1 on bad arguments or if the PDF cannot be opened,
 *         2 if the XML output device could not be set up.
 */
int main(int argc, char *argv[])
{
  // parse args
  bool ok = parseArgs(argDesc, &argc, argv);
  if (!ok || argc < 2 || argc > 3 || printHelp) {
    fprintf(stderr, "pdftoipe version %s\n", PDFTOIPE_VERSION);
    printUsage("pdftoipe", "<PDF-file> [<XML-file>]", argDesc);
    return 1;
  }

  // NOTE(review): fileName is never deleted here; presumably PDFDoc takes
  // ownership of it -- confirm against the Poppler version in use.
  GooString *fileName = new GooString(argv[1]);

  globalParams = new GlobalParams();
  if (quiet)
    globalParams->setErrQuiet(quiet);

  GooString *ownerPW, *userPW;
  if (ownerPassword[0]) {
    ownerPW = new GooString(ownerPassword);
  } else {
    ownerPW = 0;
  }
  if (userPassword[0]) {
    userPW = new GooString(userPassword);
  } else {
    userPW = 0;
  }

  // open PDF file
  PDFDoc *doc = new PDFDoc(fileName, ownerPW, userPW);
  delete userPW;
  delete ownerPW;

  if (!doc->isOk()) {
    // release what was allocated so far before bailing out
    delete doc;
    delete globalParams;
    return 1;
  }

  // construct XML file name
  std::string xmlFileName;
  if (argc == 3) {
    xmlFileName = argv[2];
  } else {
    const char *name = fileName->c_str();
    const int len = fileName->getLength();
    // Only look at the last four characters when the name is long enough;
    // otherwise the pointer would end up before the start of the buffer.
    const char *ext = (len >= 4) ? name + len - 4 : 0;
    if (ext && (!strcmp(ext, ".pdf") || !strcmp(ext, ".PDF"))) {
      xmlFileName = std::string(name, len - 4);
    } else {
      xmlFileName = name;
    }
    xmlFileName += ".ipe";
  }

  // get page range
  if (firstPage < 1)
    firstPage = 1;

  if (lastPage < 1 || lastPage > doc->getNumPages())
    lastPage = doc->getNumPages();

  // write XML file
  XmlOutputDev *xmlOut =
    new XmlOutputDev(xmlFileName, doc->getXRef(),
                     doc->getCatalog(), firstPage, lastPage);

  // tell output device about text handling
  xmlOut->setTextHandling(math, notext, literal, mergeLevel, unicodeLevel);

  int exitCode = 2;
  if (xmlOut->isOk()) {
    doc->displayPages(xmlOut, firstPage, lastPage,
                      // double hDPI, double vDPI, int rotate,
                      // bool useMediaBox, bool crop, bool printing,
                      72.0, 72.0, 0, false, false, false);
    exitCode = 0;
  }

  if (xmlOut->hasUnicode()) {
    fprintf(stderr, "The document contains Unicode (non-ASCII) text.\n");
    if (unicodeLevel <= 1)
      fprintf(stderr, "Unknown Unicode characters were replaced by [U+XXX].\n");
    else
      fprintf(stderr, "UTF-8 was set as document encoding in the preamble.\n");
  }

  // clean up
  delete xmlOut;
  delete doc;
  delete globalParams;

  return exitCode;
}
Пример #3
0
//
// nextDocument
//
// Extracts the PDF named by _documentPath into _documentTextBuffer and
// returns a pointer to the member _unparsedDocument describing it, or 0 if
// no document path is set.  If the PDF carries metadata with an rdf element,
// it is parsed and, when a title and/or author was found, a small
// <head>...</head> block is written in front of the page text.  The document
// path is stored after the text and exposed as the "path" and "docno"
// metadata pairs.  _documentPath is cleared before returning.
//
// Throws a lemur::api::Exception (via LEMUR_RETHROW) if the metadata XML
// cannot be read.
//
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );
  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // Fetch the metadata once and release it below; the original called
      // readMetadata() twice and never freed either result.
      // NOTE(review): assumes readMetadata() hands back a caller-owned
      // GString, as in xpdf -- confirm for the library version in use.
      GString* rawMetaData = doc->readMetadata();
      if( rawMetaData != NULL ) {
        GString preparedMetaData="";

        // look for <rdf:RDF and end at </rdf:RDF>
        for( int x=0; x<rawMetaData->getLength(); x++ ) {
          const char c = rawMetaData->getChar(x);
          if( c != '?' && c != ':' ) {
            //skip characters which the XMLReader doesn't understand
            preparedMetaData.append(c);
          }
        }
        delete rawMetaData;
        rawMetaData = 0;

        std::string metaData(preparedMetaData.getCString());
        const std::string::size_type startbegin = metaData.find("<rdf");

        // Without an rdf element there is nothing to parse; previously the
        // failed search result was passed to substr(), which throws
        // std::out_of_range.
        if( startbegin != std::string::npos ) {
          const std::string::size_type stopend = metaData.find(">", metaData.rfind("</rdf") );
          // if no closing '>' follows the start, keep everything up to the end
          const std::string::size_type count =
            ( stopend == std::string::npos || stopend < startbegin )
              ? std::string::npos
              : (stopend - startbegin) + 1;
          metaData = metaData.substr( startbegin, count );

          indri::xml::XMLReader reader;

          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // do not leak the PDF objects when propagating the error
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";
            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }
            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }
            createdPdfHeader+="</head>\n";
            // reserve room for the header plus its terminating NUL
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }
      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }


  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
  // the document path is stored (NUL-terminated) after the extracted text
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  _documentPath = "";

  return &_unparsedDocument;
}