//
// nextDocument
//
// Returns the remaining contents of the zlib stream _in as one document,
// or 0 if the stream is already at end-of-file.
//
// The returned document carries three metadata pairs ("path", "docno",
// "filetype").  Its text is NUL-terminated inside _buffer: textLength
// counts the terminator, contentLength does not.  The returned pointer
// refers to the member _document, which is reset at the start of each call.
//
indri::parse::UnparsedDocument* indri::parse::TextDocumentExtractor::nextDocument() {
  _buffer.clear();
  _document.text = 0;
  _document.textLength = 0;
  // reset content as well, so that nothing stale is left behind when
  // we return 0 below
  _document.content = 0;
  _document.contentLength = 0;
  _document.metadata.clear();

  if( gzeof( _in ) )
    return 0;

  // set up metadata
  indri::parse::MetadataPair pair;

  pair.value = _filename.c_str();
  pair.valueLength = _filename.length()+1;
  pair.key = "path";
  _document.metadata.push_back( pair );

  // the docno is derived from the file name; cleanDocno() is applied
  // after _docnostring has been set
  _docnostring.assign(_filename.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _document.metadata.push_back( pair );

  pair.key = "filetype";
  pair.value = (void*) "TEXT";
  pair.valueLength = sizeof "TEXT"; // includes the terminating NUL
  _document.metadata.push_back( pair );

  // get document text
  while( !gzeof(_in) ) {
    const int readChunk = 1024*1024;
    char* textSpot = _buffer.write(readChunk);
    int read = gzread( _in, textSpot, readChunk );

    if( read <= 0 ) {
      // gzread returns -1 on error.  Using that value in the unwrite
      // below would retract more than was reserved, and gzeof() might
      // never become true, so give the whole chunk back and stop.
      // NOTE(review): after a read error gzeof() can stay false, so a
      // caller that loops until 0 is returned may be handed this
      // (partial) document again -- confirm how callers should be told.
      _buffer.unwrite( readChunk );
      break;
    }

    // give back the part of the chunk that was not filled
    _buffer.unwrite( readChunk - read );
  }

  // terminate string
  *_buffer.write(1) = 0;

  _document.text = _buffer.front();
  _document.textLength = _buffer.position();
  _document.content = _buffer.front();
  _document.contentLength = _buffer.position() - 1; // no null

  return &_document;
}
//
// nextDocument
//
// Reads one message from the mbox stream _in and returns it as a document,
// or returns 0 when the stream can no longer be read.
//
// Header lines are scanned up to the first line equal to MBOX_EMPTY_LINE.
// The From:/To:/Subject:/Cc:/Date: headers (with their tab-indented
// continuation lines) are handed to _copyMetadata.  The lines that follow
// are appended to _buffer as message text until a line starting with
// "From" is read; that line is consumed from the stream but dropped from
// the text.
//
// The text is NUL-terminated inside _buffer: textLength counts the
// terminator, contentLength does not.  The returned pointer refers to the
// member _document, which is reset at the start of each call.
//
indri::parse::UnparsedDocument* indri::parse::MboxDocumentIterator::nextDocument() {
  _buffer.clear();
  _document.text = 0;
  _document.textLength = 0;
  _document.content = 0;
  _document.contentLength = 0;
  _document.metadata.clear();

  // All loops below test good() rather than !eof().  A getline that fails
  // without reaching end-of-file (a line longer than the destination, a
  // stream that never opened) sets failbit only; every later read is then
  // a no-op, so a loop that waits for eof() would never finish.
  if( !_in.good() )
    return 0;

  char headerLine[ MBOX_MAX_HEADER_LINE_LENGTH ];

  // skim past all unnecessary headers
  // want to catch:
  //    recipient
  //    author
  //    subject
  //    date
  // all of these need to be metadata and indexed content
  static const field_t fields[] = {
    { "From:", "author", 5 },
    { "To:", "recipient", 3 },
    { "Subject:", "subject", 8 },
    { "Cc:", "copied", 3 },
    { "Date:", "date", 5 }
  };
  const size_t fieldCount = sizeof fields / sizeof fields[0];
  int field = -1;

  _in.getline( headerLine, MBOX_MAX_HEADER_LINE_LENGTH );

  while( _in.good() ) {
    // if the line is empty, we're done with this header
    if( !strcmp( headerLine, MBOX_EMPTY_LINE ) )
      break;

    // record the number of bytes read; this counts the newline, which
    // getline replaced by the '\0' at headerLine[lineLength-1]
    int lineLength = (int) _in.gcount();
    int extraLength = 0;

    // is this an interesting line?
    for( size_t i=0; i<fieldCount; i++ ) {
      if( !strncmp( fields[i].field, headerLine, fields[i].length ) ) {
        field = (int) i;
        break;
      }
    }

    if( field >= 0 ) {
      // some fields are multi-line; continuation lines start with a tab
      // character.  therefore, we'll try to fetch more lines now,
      // appending each one directly behind what we already have
      while( _in.good() ) {
        if( lineLength >= MBOX_MAX_HEADER_LINE_LENGTH ) {
          // no room left in headerLine: looking at headerLine[lineLength]
          // would read past the array.  Treat this like any other
          // over-long line and mark the stream as failed.
          _in.setstate( std::ios_base::failbit );
          break;
        }

        _in.getline( headerLine + lineLength, MBOX_MAX_HEADER_LINE_LENGTH - lineLength );
        extraLength = (int) _in.gcount();

        if( headerLine[lineLength] != '\t' )
          break; // not a continuation; this is the next header line

        // add a newline where the '\0' was, joining the two lines
        headerLine[lineLength-1] = '\n';
        lineLength += extraLength;
        extraLength = 0;
      }

      // now, copy to metadata
      _copyMetadata( headerLine, fields[field].length, fields[field].tag );

      // move next line data to beginning of buffer; gcount() still
      // describes that line, so the next iteration picks up its length
      memmove( headerLine, headerLine + lineLength, extraLength );

      // clear field
      field = -1;
    } else {
      _in.getline( headerLine, MBOX_MAX_HEADER_LINE_LENGTH );
    }
  }

  // now, we're catching message text
  // we will stop (and throw out content) as soon as we see a "From" line
  // NOTE(review): any line that merely starts with "From" ends the
  // message, including body lines such as "From: ..." -- confirm whether
  // matching "From " was intended.
  while( _in.good() ) {
    const int readChunk = 1024*1024;
    char* textSpot = _buffer.write(readChunk);

    _in.getline( textSpot, readChunk );
    const int actual = (int) _in.gcount();

    if( actual <= 0 ) {
      // nothing was extracted (end of file right after the previous
      // line, or a failed stream).  Writing textSpot[actual-1] here
      // would land in front of the chunk we just reserved.
      _buffer.unwrite( readChunk );
      break;
    }

    // number of bytes of this chunk that stay in the buffer
    int kept = actual;

    if( _in.good() ) {
      // the newline was extracted and counted; getline stored '\0' in
      // its place, so put the newline back
      textSpot[actual-1] = '\n';
    } else {
      // no newline was extracted (last line of a file without a trailing
      // newline, or a line longer than readChunk).  The '\0' sits right
      // behind the data; turn that into the newline instead of
      // overwriting the last character of the line.
      textSpot[actual] = '\n';
      kept = actual + 1;
    }

    _buffer.unwrite( readChunk - kept );

    // done reading at a "From" line
    if( !strncmp( textSpot, "From", 4 ) ) {
      _buffer.unwrite( kept );
      break;
    }
  }

  // terminate string
  *_buffer.write(1) = 0;

  // fix up existing metadata: the values stored so far are treated as
  // offsets into _metaBuffer and are turned into real pointers here
  for( size_t i=0; i<_document.metadata.size(); i++ ) {
    size_t offset = (size_t) _document.metadata[i].value;
    _document.metadata[i].value = _metaBuffer.front() + offset;
  }

  // add type metadata
  indri::parse::MetadataPair pair;
  pair.key = "filetype";
  pair.value = (void*) "MBOX";
  pair.valueLength = sizeof "MBOX";
  _document.metadata.push_back( pair );

  // copy subject into docno (first subject found; no docno is added
  // when the message has no Subject: header)
  for( size_t i=0; i<_document.metadata.size(); i++ ) {
    if( !strcmp( "subject", _document.metadata[i].key ) ) {
      _docnostring.assign((char *)_document.metadata[i].value );
      cleanDocno();
      pair.value = _docnostring.c_str();
      pair.valueLength = _docnostring.length()+1;
      pair.key = "docno";
      _document.metadata.push_back( pair );
      break;
    }
  }

  _document.text = _buffer.front();
  _document.textLength = _buffer.position();
  _document.content = _buffer.front();
  // no null: the terminator written above is not part of the content
  _document.contentLength = _buffer.position() - 1;

  return &_document;
}
//
// nextDocument
//
// Extracts the text of the PDF named by _documentPath and returns it as a
// document, or returns 0 if _documentPath is empty.  _documentPath is
// cleared before returning, so the next call returns 0 unless the path
// has been set again in between.
//
// If the PDF carries metadata containing an RDF element, that element is
// parsed and passed to appendPdfMetaData; when _title or _author is
// non-empty afterwards, a small <head> section is written in front of the
// page text.
//
// A PDF that cannot be opened, or that does not permit copying, yields a
// document of length 0.  Throws (via LEMUR_RETHROW) when the metadata
// cannot be read as XML.
//
indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
  if( !_documentPath.length() )
    return 0;

  PDFDoc* doc = 0;
  TextOutputDev* textOut = 0;
  // NOTE(review): gfilename is not deleted here; presumably PDFDoc takes
  // ownership of it -- confirm against the xpdf version in use.
  GString* gfilename = new GString(_documentPath.c_str());
  doc = new PDFDoc( gfilename );

  // if the doc is not ok, or ok to copy, it
  // will be a document of length 0.
  if( doc->isOk() && doc->okToCopy() ) {
    void* stream = &_documentTextBuffer;
    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);

    if ( textOut->isOk() ) {
      int firstPage = 1;
      int lastPage = doc->getNumPages();
      double hDPI=72.0;
      double vDPI=72.0;
      int rotate=0;
      GBool useMediaBox=gFalse;
      GBool crop=gTrue;
      GBool printing=gFalse;

      // Ask for the metadata once and release it when done; the string
      // returned by readMetadata() is treated as owned by the caller.
      GString* rawMetaData = doc->readMetadata();

      if( rawMetaData != NULL ) {
        // skip characters which the XMLReader doesn't understand
        // ('?' and ':').  Stop at an embedded NUL: the text is handed to
        // the reader as a C string, so nothing behind it would be seen.
        std::string metaData;
        for( int x=0; x<rawMetaData->getLength(); x++ ) {
          const char c = rawMetaData->getChar(x);
          if( c == '\0' )
            break;
          if( c != '?' && c != ':' )
            metaData += c;
        }
        delete rawMetaData;
        rawMetaData = 0;

        // look for <rdf:RDF and end at </rdf:RDF> (the ':' has been
        // stripped above, hence "<rdf" / "</rdf")
        const std::string::size_type rdfBegin = metaData.find("<rdf");

        // Without an RDF element there is nothing to parse; passing the
        // "not found" position on to substr() would throw
        // std::out_of_range.
        if( rdfBegin != std::string::npos ) {
          const std::string::size_type rdfEnd = metaData.find(">", metaData.rfind("</rdf") );
          // when no usable closing tag is found, take everything up to
          // the end of the metadata and let the XML reader judge it
          const std::string::size_type rdfLength =
            ( rdfEnd != std::string::npos && rdfEnd >= rdfBegin ) ? (rdfEnd - rdfBegin) + 1
                                                                  : std::string::npos;
          metaData = metaData.substr( rdfBegin, rdfLength );

          indri::xml::XMLReader reader;
          try {
            std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
            appendPdfMetaData( result.get() );
          } catch( lemur::api::Exception& e ) {
            // release the xpdf objects before the exception leaves this
            // function; nothing else would delete them
            delete textOut;
            delete doc;
            LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
          }

          if( _author.length()>0 || _title.length()>0 ) {
            std::string createdPdfHeader;
            createdPdfHeader="<head>\n";
            if(_title.length()>0) {
              createdPdfHeader+="<title>";
              createdPdfHeader+=_title;
              createdPdfHeader+="</title>\n";
            }
            if(_author.length()>0) {
              createdPdfHeader+="<author>";
              createdPdfHeader+=_author;
              createdPdfHeader+="</author>\n";
            }
            createdPdfHeader+="</head>\n";

            // the header goes into the text buffer ahead of the page text
            char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
            strcpy(metastream, createdPdfHeader.c_str());
          }
        }
      }

      // page text is delivered to _documentTextBuffer through buffer_write
      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
    }
  }

  delete textOut;
  delete doc;

  _unparsedDocument.textLength = _documentTextBuffer.position();
  // no null; 0 if text is empty.
  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ;

  // the path is stored behind the text in the same buffer, after the
  // lengths above have been taken, so it is not counted as document text
  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
  strcpy( docnoPoint, _documentPath.c_str() );

  // front() is read only after the last write to the buffer
  _unparsedDocument.text = _documentTextBuffer.front();
  _unparsedDocument.content = _documentTextBuffer.front();
  _unparsedDocument.metadata.clear();

  indri::parse::MetadataPair pair;

  pair.key = "path";
  pair.value = docnoPoint;
  pair.valueLength = _documentPath.length()+1;
  _unparsedDocument.metadata.push_back( pair );

  // the docno is derived from the path; cleanDocno() is applied after
  // _docnostring has been set
  _docnostring.assign(_documentPath.c_str() );
  cleanDocno();
  pair.value = _docnostring.c_str();
  pair.valueLength = _docnostring.length()+1;
  pair.key = "docno";
  _unparsedDocument.metadata.push_back( pair );

  // an empty path makes the next call return 0
  _documentPath = "";

  return &_unparsedDocument;
}