/**
 * Check whether the entry being analyzed corresponds to a real file on disk.
 * Only top-level entries (depth 0) can map to a filesystem path; embedded
 * streams never do.
 * @return true when the entry is top-level and stat() succeeds on its path.
 */
bool HelperEndAnalyzer::checkForFile(const AnalysisResult& idx) const {
    // embedded entries (depth > 0) are in-memory streams, not on-disk files
    if (idx.depth() > 0) {
        return false;
    }
    struct stat fileInfo;
    // stat() returns 0 when the path exists and is accessible
    return stat(idx.path().c_str(), &fileInfo) == 0;
}
/**
 * Analyze an lzma-compressed stream.
 * If the decompressed payload looks like a tar archive, delegate to the tar
 * analyzer; otherwise index the payload as a single child document named
 * after the file with the ".lzma" suffix stripped.
 * @return 0 on success, -1 for a null stream, -2 on a read error.
 */
signed char
LzmaEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if(!in) return -1;
    LZMAInputStream stream(in);
    // since this is lzma file, its likely that it contains a tar file
    const char* start = 0;
    int32_t nread = stream.read(start, 1024, 0);
    if (nread < -1) {
        fprintf(stderr, "Error reading lzma: %s\n", stream.error());
        return -2;
    }
    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    } else {
        std::string name = idx.fileName();
        string::size_type len = name.length();
        // strip the trailing ".lzma" so the child carries the payload's name
        if (len > 5 && name.substr(len-5)==".lzma") {
            name = name.substr(0, len-5);
        }
        signed char r = idx.indexChild(name, idx.mTime(), &stream);
        // close the child scope; GZipEndAnalyzer::analyze does the same, and
        // omitting it leaves the AnalysisResult nesting in a bad state
        idx.finishIndexChild();
        return r;
    }
}
/**
 * Analyze a gzip-compressed stream.
 * If the decompressed payload looks like a tar archive, delegate to the tar
 * analyzer; otherwise index the payload as a single child document named
 * after the file with the ".gz" suffix stripped.
 * @return 0 on success, -1 for a null stream, -2 on a read error.
 */
signed char
GZipEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if(!in) return -1;
    GZipInputStream stream(in);
    // since this is gzip file, its likely that it contains a tar file
    const char* start = 0;
    int32_t nread = stream.read(start, 1024, 0);
    if (nread < -1) {
        // report to stderr, consistent with the other analyzers here
        // (the original printf sent the error to stdout)
        fprintf(stderr, "Error reading gzip: %s\n", stream.error());
        return -2;
    }
    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    } else {
        std::string file = idx.fileName();
        size_t len = file.length();
        // strip the trailing ".gz" so the child carries the payload's name
        if (len > 3 && file.substr(len-3) == ".gz") {
            file = file.substr(0, len-3);
        }
        signed char r = idx.indexChild(file, idx.mTime(), &stream);
        idx.finishIndexChild();
        return r;
    }
}
/**
 * Try to index an OLE stream as a Thumbs.db thumbnail entry.
 * A thumbnail stream starts with a fixed 8-byte signature inside a 12-byte
 * header; the image data follows the header.
 * @return true when the entry was recognized and indexed; false otherwise,
 *         with the stream rewound so the caller can try other handlers.
 */
bool
tryThumbsdbEntry(const string& name, AnalysisResult& ar, InputStream* in) {
    // signature shared by all Thumbs.db thumbnail streams
    static const char magic[] = {0x0c, 0, 0, 0, 0x01, 0, 0, 0};
    const char* data;
    uint32_t got = in->read(data, 12, 12);
    if (got != 12 || memcmp(magic, data, 8) != 0) {
        // not a thumbnail: rewind for the next candidate handler
        in->reset(0);
        return false;
    }
    // everything after the 12-byte header is the image itself
    SubInputStream thumb(in, in->size() - 12);
    ar.indexChild(name, 0, &thumb);
    ar.finishIndexChild();
    return true;
}
/**
 * Analyze an ar(1) archive: delegate to staticAnalyze() and tag the
 * result as an archive.
 * @return 0 on success, negative on error (propagated from staticAnalyze).
 */
signed char
ArEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    // use 'signed char' explicitly: plain 'char' is unsigned on some
    // platforms (e.g. ARM), which would corrupt the negative error codes
    // returned by staticAnalyze before they reach the caller
    signed char result = staticAnalyze(idx, in);
    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");
    return result;
}
/**
 * Analyze an OLE2 compound document: iterate the contained streams and
 * dispatch each to a specialized handler (Word FIB, Thumbs.db thumbnail,
 * property stream, PowerPoint 'Pictures'), or index it as a plain child.
 * @return 0 on success, -1 on error (m_error carries the message).
 */
signed char
OleEndAnalyzer::analyze(AnalysisResult& ar, InputStream* in) {
    if(!in) return -1;
    result = &ar; // remember the current result for the helper callbacks
    OleInputStream ole(in);
    InputStream *s = ole.nextEntry();
    if (ole.status()) {
        fprintf(stderr, "error: %s\n", ole.error());
        return -1;
    }
    while (s) {
        string name = ole.entryInfo().filename;
        if (name.size()) {
            // OLE stream names may begin with a control byte (< 10) that
            // tags special streams; strip it but keep it for dispatching
            char first = name[0];
            if (first < 10) {
                name = name.substr(1);
            }
            // try the specialized handlers in order; each returns false and
            // rewinds the stream when the entry is not of its kind
            if (tryFIB(ar, s)) {
            } else if (tryThumbsdbEntry(name, ar, s)) {
            } else if (first == 5) {
                // todo: handle property stream
                tryPropertyStream(ar, s);
            } else if (name == "Pictures") {
                tryPictures(ar, s);
            //} else if (name == "1Table" || name == "0Table") {
            //    word1Table.assign(getStreamString(s));
            } else {
                // ordinary embedded stream: index it as a child document
                ar.indexChild(name, ole.entryInfo().mtime, s);
                ar.finishIndexChild();
            }
        }
        s = ole.nextEntry();
    }
    if (ole.status() == Error) {
        m_error = ole.error();
        return -1;
    } else {
        ar.addValue(factory->typeField,
            "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document");
        m_error.resize(0);
    }
    return 0;
}
/**
 * Extract images from a 'Pictures' field from a ppt file.
 * Each picture record starts with a 25-byte header; bytes 4-7 hold the
 * record length (little-endian), of which 17 bytes are non-image header.
 * http://jakarta.apache.org/poi/apidocs/org/apache/poi/hslf/model/Picture.html
 **/
void
tryPictures(AnalysisResult& ar, InputStream* in) {
    const char* header;
    int32_t got = in->read(header, 25, 25);
    ostringstream childName;
    int index = 1;
    while (got == 25) {
        // record length minus the 17 non-image bytes of the header
        uint32_t size = readLittleEndianInt32(header + 4) - 17;
        SubInputStream picture(in, size);
        childName << "Pictures/" << index++;
        ar.indexChild(childName.str(), 0, &picture);
        ar.finishIndexChild();
        // drain the substream so the parent stream is positioned exactly
        // at the next picture header
        const char* ignored;
        while (picture.read(ignored, 1, 0) > 0) {
        }
        childName.str("");
        got = in->read(header, 25, 25);
    }
}
/**
 * Walk an ar archive and index its entries.
 * When the first two members are named 'debian-binary' and 'control.tar.gz'
 * they are indexed unconditionally: their presence marks a Debian package.
 * The remaining members are indexed only when the configuration enables
 * archive-content indexing; the walk stops early on the configured read
 * limit or when the analysis is aborted.
 * @return 0 on success or clean abort, -1 on null stream or archive error.
 */
signed char
ArEndAnalyzer::staticAnalyze(AnalysisResult& idx, InputStream* in) {
    if (!in) return -1;
    ArInputStream ar(in);
    InputStream* entry = ar.nextEntry();
    // Debian package marker files: always analyzed when present up front
    if (entry && ar.entryInfo().filename == "debian-binary") {
        idx.indexChild(ar.entryInfo().filename, ar.entryInfo().mtime, entry);
        idx.finishIndexChild();
        entry = ar.nextEntry();
    }
    if (entry && ar.entryInfo().filename == "control.tar.gz") {
        idx.indexChild(ar.entryInfo().filename, ar.entryInfo().mtime, entry);
        idx.finishIndexChild();
        entry = ar.nextEntry();
    }
    if (idx.config().indexArchiveContents()) {
        while (entry) {
            // stop when the configured maximal read length is exceeded
            int64_t limit = idx.config().maximalStreamReadLength(idx);
            if (limit != -1 && in->position() > limit) {
                return 0;
            }
            // stop when the overall analysis has been aborted
            if (!idx.config().indexMore()) {
                return 0;
            }
            idx.indexChild(ar.entryInfo().filename, ar.entryInfo().mtime,
                entry);
            idx.finishIndexChild();
            entry = ar.nextEntry();
        }
    }
    return (ar.status() == Error) ? -1 : 0;
}
/**
 * Analyze a stream by delegating to an external helper program chosen by
 * content sniffing, then index the helper's text output.
 * @return the TextEndAnalyzer result, -1 when no helper applies or the
 *         stream is null, or Error when the input stream itself failed.
 */
signed char
HelperEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in){
    if(!in) return -1;
    signed char state = -1;
    const char* b;
    // sniff the first 1k so the right helper can be chosen by content
    int32_t nread = in->read(b, 1024, 0);
    in->reset(0);
    if (nread > 0) {
        HelperProgramConfig::HelperRecord* h
            = helperconfig.findHelper(b, nread);
        if (h) {
            // fprintf(stderr, "calling %s on %s\n", h->arguments[0].c_str(),
            //     idx.path().c_str());
#if !defined(_WIN32) && !defined(_WIN64)
#warning this does not work on windows because processinputstream does not compile!
            if (h->readfromstdin) {
                // helper reads the document from its standard input
                ProcessInputStream pis(h->arguments, in);
                TextEndAnalyzer t;
                state = t.analyze(idx, &pis);
            } else {
                // helper needs a file path: use the on-disk file when there
                // is one, otherwise spool the stream to a temporary file
                string filepath;
                bool fileisondisk = checkForFile(idx);
                if (fileisondisk) {
                    filepath = idx.path();
                } else {
                    filepath = writeToTempFile(in);
                }
                // substitute the '%s' placeholder with the actual path
                vector<string> args = h->arguments;
                for (uint j=0; j<args.size(); ++j) {
                    if (args[j] == "%s") {
                        args[j] = filepath;
                    }
                }
                ProcessInputStream pis(args);
                TextEndAnalyzer t;
                state = t.analyze(idx, &pis);
                // clean up the temp file we created above
                if (!fileisondisk) {
                    unlink(filepath.c_str());
                }
            }
#endif
        }
    }
    if (in->status() == Error) {
        m_error = in->error();
        state = Error;
    }
    return state;
}
// parse with info from // http://www.wotsit.org/getfile.asp?file=wword8&sc=230027800 bool OleEndAnalyzer::tryFIB(AnalysisResult& ar, InputStream* in) { const char* d; int32_t size = 426; int32_t nread = in->read(d, size, size); in->reset(0); if (nread != size || (unsigned char)d[0] != 0xec || (unsigned char)d[1] != 0xa5) { return false; } bool complex = (d[10] & 4) == 4; if (complex) return false; int32_t fcMin = readLittleEndianInt32(d+24); int32_t fcMac = readLittleEndianInt32(d+28); // for some reason we sometimes need to add 512 here. No clue why. // if the first 512 bytes are 0 we do this size = fcMin+512; nread = in->read(d, size, size); in->reset(0); if (nread != size) { return false; } int i; for (i=0; i<512 && d[i+fcMin] == 0; i++) ; if (i == 512) { fcMin += 512; fcMac += 512; } size = fcMac; nread = in->read(d, size, size); in->reset(0); if (nread != size) { return false; } wordtext.reset(); for (int32_t dp = fcMin; dp < fcMac; dp += size) { size = fcMac-dp; if (size > 512) size = 512; wordtext.addText(d+dp, size); } wordtext.cleanText(); ar.addText(wordtext.text(), (int32_t)wordtext.length()); wordtext.reset(); return true; }
signed char OdfEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) { if(!in) return -1; ZipInputStream zip(in); InputStream *s = zip.nextEntry(); if (zip.status() != Ok) { m_error = zip.error(); return -1; } while (s) { // check if we're done int64_t max = idx.config().maximalStreamReadLength(idx); if (max != -1 && in->position() > max) { return 0; } // check if the analysis has been aborted if (!idx.config().indexMore()) { return 0; } if (zip.entryInfo().filename == "mimetype") { const char *buf; size_t nread; if ((nread = s->read(buf, 47, 47)) < 39) return -1; if (strncmp(buf, "application/vnd.oasis.opendocument.", 35)) return -1; const char *rdftype; buf += 35; if( nread >= (35+4) && strncmp(buf, "text", 4) == 0 ) { rdftype = NFO "PaginatedTextDocument"; } else if ( nread >= (35+12) && strncmp(buf, "presentation", 12) == 0 ) { rdftype = NFO "Presentation"; } else if ( nread >= (35+11) && strncmp(buf, "spreadsheet", 11) == 0 ) { rdftype = NFO "Spreadsheet"; } else rdftype = NFO "Document"; idx.addValue(factory->typeField, rdftype); } else if (zip.entryInfo().filename == "meta.xml") { metaHelper.analyze(idx, s); } else if (zip.entryInfo().filename == "content.xml") { contentHelper.analyze(idx,s); } else if (zip.entryInfo().filename.substr(0,9) == "Pictures/") { idx.indexChild(zip.entryInfo().filename, zip.entryInfo().mtime, s); idx.finishIndexChild(); } s = zip.nextEntry(); } if (zip.status() == Error) { m_error = zip.error(); return -1; } else { m_error.resize(0); } return 0; }
/**
 * Core analysis driver for one stream at one nesting depth.
 * Builds/reuses the per-depth analyzer sets, connects the through-analyzers
 * into the stream chain, lets each end-analyzer whose checkHeader() matches
 * try to analyze the stream (rewinding between attempts), then drains the
 * stream so through-analyzers that need the full content can finish, and
 * finally records the stream size.
 * @return 0 on success or clean abort, -1 when aborted after an
 *         end-analyzer failure, -2 on a stream error while draining.
 */
signed char
StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
    //cerr << "analyze " << idx.path().c_str() << endl;

    // retrieve or construct the through analyzers and end analyzers
    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
    vector<vector<StreamEndAnalyzer*> >::iterator eIter;
    // grow the per-depth analyzer sets until this depth has one
    while ((int)through.size() <= idx.depth()) {
        addThroughAnalyzers();
        addEndAnalyzers();
    }
    tIter = through.begin() + idx.depth();
    eIter = end.begin() + idx.depth();

    // read the headersize size before connecting the throughanalyzers
    // This ensures that the first read is at least this size, even if the
    // throughanalyzers read smaller chunks.
    bool finished = false;
    const char* header = 0;
    int32_t headersize = 1024;
    if (input) {
        headersize = input->read(header, headersize, headersize);
        input->reset(0);
        if (headersize < 0) finished = true;
    }

    // insert the through analyzers
    vector<StreamThroughAnalyzer*>::iterator ts;
    for (ts = tIter->begin();
            (input == 0 || input->status() == Ok) && ts != tIter->end();
            ++ts) {
        (*ts)->setIndexable(&idx);
        // each through-analyzer may wrap the stream; use the wrapped one
        input = (*ts)->connectInputStream(input);
        if (input && input->position() != 0) {
            cerr << "Analyzer " << (*ts)->name()
                << " has left the stream in a bad state." << endl;
        }
    }

    // reread the header so we can use it for the endanalyzers
    if (input && headersize > 0) {
        headersize = input->read(header, headersize, headersize);
        if (headersize <= 0) {
            finished = true;
        } else if (input->reset(0) != 0) {
            cerr << "resetting is impossible!! \npos: " << input->position()
                << " status: " << input->status() << endl;
        }
    } else {
        // indicate that we have no data in the stream
        headersize = -1;
        finished = true;
    }

    // let each end-analyzer whose header check matches have a go
    size_t es = 0;
    size_t itersize = eIter->size();
    while (!finished && es != itersize) {
        StreamEndAnalyzer* sea = (*eIter)[es];
        if (sea->checkHeader(header, headersize)) {
            idx.setEndAnalyzer(sea);
            char ar = sea->analyze(idx, input);
            if (ar) {
                // the end-analyzer failed; rewind and try the next one
                // FIXME: find either a NIE-compliant way to report errors
                // or use some API for this
                // idx.addValue(errorfield, sea->name() + string(": ")
                //     + sea->error());
                if (!idx.config().indexMore()) {
                    removeIndexable(idx.depth());
                    return -1;
                }
                int64_t pos = input->reset(0);
                if (pos != 0) { // could not reset
                    cerr << "could not reset stream of "
                        << idx.path().c_str() << " from pos "
                        << input->position()
                        << " to 0 after reading with " << sea->name()
                        << ": " << sea->error().c_str() << endl;
                    finished = true;
                } else {
                    // refresh the pointer to the start of the data
                    headersize = input->read(header, headersize, headersize);
                    if (input->reset(0) != 0) {
                        cerr << "resetting again is impossible!! \npos: "
                            << input->position() << " status: "
                            << input->status() << endl;
                    }
                    if (headersize < 0) finished = true;
                }
            } else {
                // analysis succeeded: stop trying further end-analyzers
                finished = true;
            }
            // the analyzer may have recursed and grown 'end';
            // refresh the (possibly invalidated) iterator
            eIter = end.begin() + idx.depth();
        }
        if (!finished) {
            finished = !conf.indexMore();
        }
        es++;
    }
    idx.setEndAnalyzer(0);

    if (input) {
        // make sure the entire stream is read if the size is not known
        bool ready;
        tIter = through.begin() + idx.depth();
        uint32_t skipsize = 4096;
        do {
            // ask the analyzerconfiguration if we should continue
            int64_t max = idx.config().maximalStreamReadLength(idx);
            if (!idx.config().indexMore()
                    || (max != -1 && input->position() >= max)) {
                // we are done
                return 0;
            }
            // a known size means the stream was fully read
            ready = input->size() != -1;
            vector<StreamThroughAnalyzer*>::iterator ts;
            for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
                ready = (*ts)->isReadyWithStream();
            }
            if (!ready) {
                // drain in exponentially growing chunks (capped at 128k*4)
                input->skip(skipsize);
                if (skipsize < 131072) {
                    skipsize *= 4;
                }
            }
        } while (!ready && input->status() == Ok);
        if (input->status() == Error) {
            fprintf(stderr, "Error: %s\n", input->error());
            removeIndexable(idx.depth());
            return -2;
        }
    }

    // store the size of the stream
    if (input && input->status() != Error && input->size() >= 0) {
        // TODO remove cast
        idx.addValue(sizefield, (uint32_t)input->size());
    }

    // remove references to the analysisresult before it goes out of scope
    removeIndexable(idx.depth());
    return 0;
}
/**
 * Analyze a document through the Windows IFilter COM interface.
 * The filter is looked up by extension; since IFilter needs a file path,
 * embedded streams are first spooled to a temporary file. Text chunks are
 * added as document text; the title property (propid 2) is indexed as the
 * 'title' value.
 * @return 0 when a filter ran successfully, -1 otherwise.
 */
signed char
IFilterEndAnalyzer::analyze(AnalysisResult& idx, InputStream *in) {
    const string& filename = idx.fileName();
    // only handle extensions for which an IFilter is registered
    int p = filename.find_last_of('.');
    if (p < 0 || extensions.find(filename.substr(p)) == extensions.end()) {
        return -1;
    }
    string filepath;
    bool fileisondisk = checkForFile(idx.depth(), filename);
    if (fileisondisk) {
        filepath = filename;
    } else {
        // spool the stream to a temp file, keeping the extension when it is
        // a plain lowercase alphanumeric one (the filter is chosen by it)
        int p = filename.find_last_of(".");
        if ( p > 0 ){
            string ext = filename.substr(p).c_str();
            // NOTE(review): in-place lowercasing through a const_cast-style
            // cast of c_str(); relies on the buffer being writable
            strlwr((char*)ext.c_str());
            p = ext.find_first_not_of(
                "._abcdefghijklmnopqrstuvwxyz0123456789");
            if ( p >= 0 )
                filepath = writeToTempFile(in, "");
            else
                filepath = writeToTempFile(in, ext.c_str());
        }else
            filepath = writeToTempFile(in, "");
    }
    if (filepath.length() > 0) {
        IFilter* filter = NULL;
        void* pvfilter=NULL;
        wchar_t tmp[MAX_PATH];
        _cpycharToWide(tmp,filepath.c_str(),MAX_PATH);
        HRESULT hr = LoadIFilter(tmp,NULL,&pvfilter);
        if (hr == S_OK) {
            filter = (IFilter*)pvfilter;
            ULONG __i=0;
            hr = filter->Init(
                IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,0,NULL,&__i);
            if (FAILED( hr )) {
                if (!fileisondisk)
                    unlink(filepath.c_str());
                return -1;
            }
            const int sbBufferLen = 1024;
            wchar_t sbBuffer[sbBufferLen];
            STAT_CHUNK ps;
            // walk the filter's chunks until GetChunk fails
            hr = filter->GetChunk(&ps);
            while ( SUCCEEDED(hr) ) {
                if (ps.flags == CHUNK_TEXT) {
                    // pull the chunk's text in buffer-sized pieces;
                    // GetText returns a negative HRESULT when exhausted
                    int resultText = 0;
                    while ( resultText >= 0 ) {
                        ULONG sizeBuffer=sbBufferLen;
                        resultText = filter->GetText(&sizeBuffer, sbBuffer);
                        if (sizeBuffer > 0 ) {
                            string str
                                = wchartoutf8(sbBuffer,sbBuffer+sizeBuffer);
                            idx.addText(str.c_str(),str.length());
                        }
                    }
                } else if ( ps.flags == CHUNK_VALUE ) {
                    PROPVARIANT *pVar;
                    while ( SUCCEEDED( hr = filter->GetValue( &pVar ) ) ) {
                        //printf("propid: %d\nkind:%d\n",
                        //    ps.attribute.psProperty.propid,
                        //    ps.attribute.psProperty.ulKind);
                        // propid 2 with ulKind 1 carries the document title
                        if ( ps.attribute.psProperty.propid == 2
                                && ps.attribute.psProperty.ulKind == 1
                                && pVar->vt == VT_LPWSTR ) {
                            string str = wchartoutf8(pVar->pwszVal,
                                pVar->pwszVal+wcslen(pVar->pwszVal));
                            idx.addValue("title", str );
                        }
                        PropVariantClear( pVar );
                        CoTaskMemFree( pVar );
                    }
                } else {
                    printf("other flag %d\n",ps.flags);
                }
                hr = filter->GetChunk(&ps);
            }
            filter->Release();
            // remove the temp file when we created one
            if (!fileisondisk)
                unlink(filepath.c_str());
            return 0;
        }
        // LoadIFilter failed: print the system error message
        DWORD dw = GetLastError();
        if ( dw != 0 ) {
            LPVOID lpMsgBuf;
            FormatMessage(
                FORMAT_MESSAGE_ALLOCATE_BUFFER |
                FORMAT_MESSAGE_FROM_SYSTEM,
                NULL,
                dw,
                MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                (LPTSTR) &lpMsgBuf,
                0, NULL );
            wprintf(L"%s\n", lpMsgBuf);
            LocalFree(lpMsgBuf);
        }
    }
    if (!fileisondisk && filepath.length()>0) {
        unlink(filepath.c_str());
    }
    return -1;
}