/**
 * Returns everything that can still be read from _in as a single document.
 *
 * The returned document carries three metadata pairs: "path" (the file name),
 * "docno" (the file name after cleanDocno() has processed _docnostring) and
 * "filetype" ("TEXT").  Every metadata length includes the terminating NUL.
 *
 * @return pointer to the internal _document, or 0 when gzeof( _in ) already
 *         reports end of input.  The pointers inside the document refer to
 *         members of this object (_buffer, _filename, _docnostring).
 */
indri::parse::UnparsedDocument* indri::parse::TextDocumentExtractor::nextDocument() {
  _buffer.clear();
  _document.text = 0;
  _document.textLength = 0;
  _document.content = 0;
  _document.contentLength = 0;
  _document.metadata.clear();

  if( gzeof( _in ) )
    return 0;

  // set up metadata
  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = _filename.c_str();
  pair.valueLength = _filename.length()+1;
  _document.metadata.push_back( pair );

  _docnostring.assign( _filename.c_str() );
  cleanDocno();
  pair.key = "docno";
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  _document.metadata.push_back( pair );

  pair.key = "filetype";
  pair.value = (void*) "TEXT";
  pair.valueLength = 5;
  _document.metadata.push_back( pair );

  // get document text, one chunk at a time
  const int readChunk = 1024*1024;

  while( !gzeof(_in) ) {
    char* textSpot = _buffer.write(readChunk);
    int read = gzread( _in, textSpot, readChunk );

    if( read <= 0 ) {
      // gzread returns -1 on error and 0 when nothing more is available.
      // Nothing usable landed in this chunk, so hand all of it back and stop;
      // using a negative count in the arithmetic below would un-write more
      // than was reserved, and gzeof() may never turn true after an error.
      _buffer.unwrite( readChunk );
      break;
    }

    // give back the part of the chunk that was not filled
    _buffer.unwrite( readChunk - read );
  }

  // terminate the text
  *_buffer.write(1) = 0;

  _document.text = _buffer.front();
  _document.textLength = _buffer.position();
  _document.content = _buffer.front();
  _document.contentLength = _buffer.position() - 1; // no null

  return &_document;
}
/**
 * Extracts the next message from the mbox stream _in.
 *
 * Header lines are scanned up to the first line equal to MBOX_EMPTY_LINE.
 * The From:, To:, Subject:, Cc: and Date: headers are handed to
 * _copyMetadata() under the tags "author", "recipient", "subject", "copied"
 * and "date".  The message body is then copied line by line into _buffer
 * until a line starting with "From" is read; that line is discarded.
 * A "filetype" pair ("MBOX") is always added, and a "docno" pair derived
 * from the subject is added when a subject was found.
 *
 * @return pointer to the internal _document, or 0 when the stream is at end
 *         of file or in a failed state.
 */
indri::parse::UnparsedDocument* indri::parse::MboxDocumentIterator::nextDocument() {
  _buffer.clear();
  _document.text = 0;
  _document.textLength = 0;
  _document.content = 0;
  _document.contentLength = 0;
  _document.metadata.clear();

  // good() is false at end of file and also after a failed read.  A failed
  // stream (for example after a line that did not fit its buffer) never
  // reaches eof(), so testing eof() alone would keep producing empty
  // documents forever.
  if( !_in.good() )
    return 0;

  char headerLine[ MBOX_MAX_HEADER_LINE_LENGTH ];

  // skim past all unnecessary headers
  // want to catch:
  //    recipient
  //    author
  //    subject
  //    date
  // all of these need to be metadata and indexed content

  static const field_t fields[] = {
    { "From:", "author", 5 },
    { "To:", "recipient", 3 },
    { "Subject:", "subject", 8 },
    { "Cc:", "copied", 3 },
    { "Date:", "date", 5 }
  };
  const size_t fieldCount = sizeof fields / sizeof fields[0];

  int field = -1;

  _in.getline( headerLine, MBOX_MAX_HEADER_LINE_LENGTH );

  while( _in.good() ) {
    // if the line is empty, we're done with this header
    if( !strcmp( headerLine, MBOX_EMPTY_LINE ) )
      break;

    // record the number of bytes read (includes the consumed newline)
    int lineLength = _in.gcount();
    int extraLength = 0;

    // is this an interesting line?
    for( size_t i=0; i<fieldCount; i++ ) {
      if( !strncmp( fields[i].field, headerLine, fields[i].length ) ) {
        field = (int) i;
        break;
      }
    }

    // if this is an interesting line, do some special processing
    if( field >= 0 ) {
      // some fields are multi-line; continuation lines start with a tab
      // character.  therefore, we'll try to fetch more lines now, appending
      // each one directly behind the text gathered so far.
      // NOTE(review): if a header line fills the buffer exactly, lineLength
      // equals MBOX_MAX_HEADER_LINE_LENGTH and headerLine[lineLength] is read
      // one past the array -- unchanged from the previous code; confirm
      // whether this can happen in practice.
      while( _in.good() ) {
        _in.getline( headerLine + lineLength, MBOX_MAX_HEADER_LINE_LENGTH - lineLength );
        extraLength = _in.gcount();

        if( headerLine[lineLength] != '\t' ) {
          // not a continuation: this is already the next header line
          break;
        } else {
          // add a newline where the '\0' was
          headerLine[lineLength-1] = '\n';
          lineLength += extraLength;
          extraLength = 0;
        }
      }

      // now, copy to metadata
      _copyMetadata( headerLine, fields[field].length, fields[field].tag );

      // move next line data to beginning of buffer
      memmove( headerLine, headerLine + lineLength, extraLength );

      // clear field
      field = -1;
    } else {
      _in.getline( headerLine, MBOX_MAX_HEADER_LINE_LENGTH );
    }
  }

  // now, we're catching message text
  // we will stop (and throw out content) as soon as we see a "From" line
  const int readChunk = 1024*1024;

  while( _in.good() ) {
    char* textSpot = _buffer.write(readChunk);
    _in.getline( textSpot, readChunk );

    int actual = _in.gcount();

    if( actual <= 0 ) {
      // nothing was extracted (end of file or failed stream): return the
      // whole chunk.  Indexing textSpot[actual-1] here would write in front
      // of the reserved region.
      _buffer.unwrite( readChunk );
      break;
    }

    // When getline consumed a newline, gcount() counts it and the '\0' sits
    // at textSpot[actual-1].  When the line ended at end of file or filled
    // the chunk, no newline was consumed, the stream is no longer good() and
    // the '\0' sits at textSpot[actual]; keep one byte more so the last
    // character of the line is not overwritten.
    int lineBytes = actual;
    if( !_in.good() )
      lineBytes = actual + 1;

    _buffer.unwrite( readChunk - lineBytes );

    // add in the newline in place of the '\0'
    textSpot[lineBytes-1] = '\n';

    // done reading at a "From" line
    if( !strncmp( textSpot, "From", 4 ) ) {
      _buffer.unwrite( lineBytes );
      break;
    }
  }

  // terminate string
  *_buffer.write(1) = 0;

  // fix up existing metadata: at this point each value holds an offset, which
  // is turned into a pointer into _metaBuffer (presumably stored that way by
  // _copyMetadata -- confirm there)
  for( size_t i=0; i<_document.metadata.size(); i++ ) {
    size_t offset = (size_t) _document.metadata[i].value;
    _document.metadata[i].value = _metaBuffer.front() + offset;
  }

  // add type metadata
  indri::parse::MetadataPair pair;
  pair.key = "filetype";
  pair.value = (void*) "MBOX";
  pair.valueLength = sizeof "MBOX";
  _document.metadata.push_back( pair );

  // copy subject into docno
  for( size_t i=0; i<_document.metadata.size(); i++ ) {
    if( !strcmp( "subject", _document.metadata[i].key ) ) {
      _docnostring.assign((char *)_document.metadata[i].value );
      cleanDocno();
      pair.value = _docnostring.c_str();
      pair.valueLength = _docnostring.length()+1;
      pair.key = "docno";
      _document.metadata.push_back( pair );
      break;
    }
  }

  _document.text = _buffer.front();
  _document.textLength = _buffer.position();
  _document.content = _buffer.front();
  _document.contentLength = _buffer.position();

  return &_document;
}
// Example #3
// 0
/**
 * Extracts the text of the PDF file named by _documentPath.
 *
 * If the PDF carries an XMP metadata stream, the <rdf ...> ... </rdf...>
 * section is parsed and passed to appendPdfMetaData(); when that leaves a
 * non-empty _title or _author, a small <head> block is written into the text
 * buffer ahead of the page text.  The page text itself is produced by
 * displayPages() through the buffer_write callback.  If the document cannot
 * be opened or may not be copied, the resulting document has no page text.
 *
 * The returned document carries "path" and "docno" metadata.  _documentPath
 * is cleared on success, so the following call returns 0.
 *
 * @return pointer to the internal _unparsedDocument, or 0 when no document
 *         path is set.
 * @throws lemur::api::Exception (via LEMUR_RETHROW) when the PDF metadata
 *         cannot be parsed as XML.
 */
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );
  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // readMetadata() hands back a heap-allocated GString owned by the
      // caller (per the xpdf API), so fetch it exactly once and release it
      // as soon as its characters have been copied.
      GString* rawMetaData = doc->readMetadata();
      if( rawMetaData != NULL ) {
        GString preparedMetaData="";

        // look for <rdf:RDF and end at </rdf:RDF>
        for( int x=0; x<rawMetaData->getLength(); x++ ) {
          char c = rawMetaData->getChar(x);
          if( c!='?' && c!=':' ) {
            // skip characters which the XMLReader doesn't understand
            preparedMetaData.append(c);
          }
        }
        delete rawMetaData;
        rawMetaData = 0;

        std::string metaData(preparedMetaData.getCString());

        // ':' was stripped above, so "<rdf:RDF" now reads "<rdfRDF"
        std::string::size_type startbegin = metaData.find("<rdf");
        std::string::size_type closeTag = metaData.rfind("</rdf");
        std::string::size_type stopend = std::string::npos;
        if( closeTag != std::string::npos )
          stopend = metaData.find(">", closeTag);

        // only parse when a complete rdf section exists; calling substr()
        // with a "not found" position would throw std::out_of_range
        if( startbegin != std::string::npos &&
            stopend != std::string::npos &&
            stopend > startbegin ) {
          metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );

          indri::xml::XMLReader reader;

          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // release the PDF objects before propagating; nothing else
            // would free them once the exception leaves this function
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";
            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }
            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }
            createdPdfHeader+="</head>\n";
            // the header is stored including its terminating NUL
            // NOTE(review): presumably buffer_write replaces that NUL when
            // the page text is appended -- confirm in buffer_write
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }

      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.

  // the path string is stored in the same buffer, right behind the text
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  // mark the document as consumed
  _documentPath = "";

  return &_unparsedDocument;
}