/**
 * Analyze a gzip-compressed stream.
 *
 * The input is wrapped in a GZipInputStream and the beginning of the
 * decompressed data is inspected. When TarInputStream::checkHeader() accepts
 * it, the decompressed stream is handed to TarEndAnalyzer::staticAnalyze().
 * Otherwise the decompressed stream is indexed as a single child named after
 * the file name of @p idx with a trailing ".gz" removed.
 *
 * @param idx analysis result that receives the type value and the child
 * @param in  compressed input stream; may be null
 * @return -1 if @p in is null, -2 if reading the decompressed data reports a
 *         value below -1, otherwise the result of the tar analyzer or of
 *         AnalysisResult::indexChild()
 */
signed char
GZipEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if (!in) {
        return -1;
    }

    GZipInputStream stream(in);

    // Since this is a gzip file, it is likely that it contains a tar file:
    // peek at the start of the decompressed data to find out.
    const char* start = 0;
    const int32_t nread = stream.read(start, 1024, 0);
    if (nread < -1) {
        // Diagnostics go to stderr, like in the other analyzers of this file,
        // so they never get mixed into regular program output on stdout.
        fprintf(stderr, "Error reading gzip: %s\n", stream.error());
        return -2;
    }

    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");

    // Rewind so that whoever consumes the stream next starts at offset 0.
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    }

    // Not a tarball: index the decompressed payload as one child, named like
    // the container minus its ".gz" suffix.
    std::string file = idx.fileName();
    const size_t len = file.length();
    if (len > 3 && file.compare(len - 3, 3, ".gz") == 0) {
        file.erase(len - 3);
    }
    const signed char r = idx.indexChild(file, idx.mTime(), &stream);
    idx.finishIndexChild();
    return r;
}
/**
 * Analyze a bzip2-compressed stream.
 *
 * The decompressed data is either passed on to the tar analyzer (when its
 * first bytes are accepted by TarInputStream::checkHeader()) or indexed as a
 * single child whose name is the file name of @p idx without a trailing
 * ".bz2".
 *
 * @param idx analysis result that receives the type value and the child
 * @param in  compressed input stream; may be null
 * @return -1 for a null @p in, -2 when the initial read reports a value below
 *         -1, otherwise whatever the tar analyzer or indexChild() returns
 */
signed char
Bz2EndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if (in == 0) {
        return -1;
    }

    BZ2InputStream bz2(in);

    // A bz2 file frequently wraps a tar archive, so look at the first bytes
    // of the decompressed data before deciding how to continue.
    const char* head = 0;
    const int32_t available = bz2.read(head, 1024, 0);
    if (available < -1) {
        fprintf(stderr, "Error reading bz2: %s\n", bz2.error());
        return -2;
    }

    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");

    // Go back to the start before handing the stream to a consumer.
    bz2.reset(0);

    if (TarInputStream::checkHeader(head, available)) {
        return TarEndAnalyzer::staticAnalyze(idx, &bz2);
    }

    // Plain compressed file: the child is named after the container with the
    // ".bz2" extension stripped, if present.
    std::string childName = idx.fileName();
    const size_t length = childName.length();
    if (length > 4 && childName.compare(length - 4, 4, ".bz2") == 0) {
        childName.resize(length - 4);
    }

    const signed char status = idx.indexChild(childName, idx.mTime(), &bz2);
    idx.finishIndexChild();
    return status;
}
/**
 * Walk the entries of an ar archive and index them as children of @p idx.
 *
 * Entries named "debian-binary" and "control.tar.gz", when encountered in
 * that order at the front of the archive, are always indexed because they
 * signal a Debian package. The remaining entries are indexed only when the
 * configuration enables indexing of archive contents.
 *
 * @param idx analysis result receiving the children
 * @param in  archive stream; may be null
 * @return -1 if @p in is null or the archive stream ends in the Error state,
 *         0 otherwise (including when iteration stops early because the read
 *         limit was exceeded or the configuration says not to index more)
 */
signed char
ArEndAnalyzer::staticAnalyze(AnalysisResult& idx, InputStream* in) {
    if (in == 0) {
        return -1;
    }

    ArInputStream archive(in);
    InputStream* entry = archive.nextEntry();

    // Debian package markers are analyzed regardless of the configuration.
    static const char* const debianMarkers[] = {
        "debian-binary",
        "control.tar.gz"
    };
    for (int i = 0; i < 2; ++i) {
        if (entry && archive.entryInfo().filename.compare(debianMarkers[i]) == 0) {
            idx.indexChild(archive.entryInfo().filename,
                archive.entryInfo().mtime, entry);
            idx.finishIndexChild();
            entry = archive.nextEntry();
        }
    }

    if (idx.config().indexArchiveContents()) {
        for (; entry; entry = archive.nextEntry()) {
            // Stop once more than the configured amount has been read.
            const int64_t limit = idx.config().maximalStreamReadLength(idx);
            if (limit != -1 && in->position() > limit) {
                return 0;
            }
            // Stop when the analysis has been aborted.
            if (!idx.config().indexMore()) {
                return 0;
            }
            idx.indexChild(archive.entryInfo().filename,
                archive.entryInfo().mtime, entry);
            idx.finishIndexChild();
        }
    }

    if (archive.status() == Error) {
        return -1;
    }
    return 0;
}
/**
 * Try to treat a stream as a Thumbs.db style entry.
 *
 * The stream must deliver exactly 12 leading bytes whose first 8 match a
 * fixed signature. On a match, everything after those 12 bytes is indexed as
 * a child called @p name (with modification time 0). On a mismatch the
 * stream is rewound to offset 0 so another handler can inspect it.
 *
 * @param name name under which the embedded data is indexed
 * @param ar   analysis result receiving the child
 * @param in   entry stream to inspect
 * @return true if the entry was recognized and indexed, false otherwise
 */
bool
tryThumbsdbEntry(const string& name, AnalysisResult& ar, InputStream* in) {
    static const char magic[] = {0x0c, 0, 0, 0, 0x01, 0, 0, 0};

    const char* header = 0;
    const int32_t nread = in->read(header, 12, 12);
    const bool recognized = (nread == 12) && (memcmp(magic, header, 8) == 0);
    if (!recognized) {
        in->reset(0);
        return false;
    }

    // The payload is whatever follows the 12 header bytes just consumed.
    SubInputStream thumb(in, in->size() - 12);
    ar.indexChild(name, 0, &thumb);
    ar.finishIndexChild();
    return true;
}
/**
 * Analyze an OLE compound document by walking its entries.
 *
 * Each named entry is offered, in order, to tryFIB() and
 * tryThumbsdbEntry(); if neither accepts it, it is handled as a property
 * stream (name starting with byte 5), as a "Pictures" stream, or indexed as
 * an ordinary child. A leading byte below 10 is stripped from the entry name
 * before the name is used.
 *
 * @param ar analysis result receiving values and children; its address is
 *           also stored in the member `result`
 * @param in document stream; may be null
 * @return -1 if @p in is null, if the stream status is non-zero right after
 *         fetching the first entry, or if iteration ends in the Error state;
 *         0 otherwise. m_error holds the stream's error text on the stream
 *         failures and is emptied on success.
 */
signed char
OleEndAnalyzer::analyze(AnalysisResult& ar, InputStream* in) {
    if (!in) {
        return -1;
    }

    result = &ar;
    OleInputStream ole(in);
    InputStream* s = ole.nextEntry();
    if (ole.status()) {
        // Record the failure in m_error as the other error path of this
        // function does, so callers never see a stale message.
        m_error = ole.error();
        fprintf(stderr, "error: %s\n", ole.error());
        return -1;
    }

    while (s) {
        string name = ole.entryInfo().filename;
        if (name.size()) {
            // Compare as unsigned: with a signed plain char, any byte >= 0x80
            // would be negative, pass the "< 10" test and be stripped by
            // mistake, making the result platform dependent.
            const unsigned char first = static_cast<unsigned char>(name[0]);
            if (first < 10) {
                name.erase(0, 1);
            }

            // Short-circuit evaluation keeps the original probing order.
            const bool handled = tryFIB(ar, s) || tryThumbsdbEntry(name, ar, s);
            if (!handled) {
                if (first == 5) {
                    // todo: handle property stream
                    tryPropertyStream(ar, s);
                } else if (name == "Pictures") {
                    tryPictures(ar, s);
                } else {
                    ar.indexChild(name, ole.entryInfo().mtime, s);
                    ar.finishIndexChild();
                }
            }
        }
        s = ole.nextEntry();
    }

    if (ole.status() == Error) {
        m_error = ole.error();
        return -1;
    }

    ar.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document");
    m_error.resize(0);
    return 0;
}
/**
 * Extract images from a 'Pictures' field from a ppt file.
 * http://jakarta.apache.org/poi/apidocs/org/apache/poi/hslf/model/Picture.html
 *
 * The stream is read as a sequence of records, each starting with a 25 byte
 * header. The little-endian 32 bit value at header offset 4 minus 17 is used
 * as the number of payload bytes that follow the header (presumably the
 * field counts 17 header bytes that have already been consumed here -- see
 * the format description linked above). Every payload is indexed as a child
 * named "Pictures/<n>", with n counting from 1.
 *
 * Parsing stops when a full 25 byte header can no longer be read, or when a
 * header announces fewer than 17 bytes, which cannot be a valid record.
 *
 * @param ar analysis result receiving one child per picture
 * @param in the 'Pictures' stream
 **/
void
tryPictures(AnalysisResult& ar, InputStream* in) {
    const char* d;
    int32_t nread = in->read(d, 25, 25);
    int pos = 1;
    while (nread == 25) {
        const uint32_t recordLength
            = static_cast<uint32_t>(readLittleEndianInt32(d + 4));
        if (recordLength < 17) {
            // Malformed record: subtracting 17 would wrap around to a size
            // of almost 4 GiB and produce a bogus child, so stop here.
            break;
        }
        const uint32_t size = recordLength - 17;

        SubInputStream sub(in, size);
        ostringstream s;
        s << "Pictures/" << pos++;
        ar.indexChild(s.str(), 0, &sub);
        ar.finishIndexChild();

        // Skip to the end of this picture so that the next header read
        // starts at the right position, however much the analyzers consumed.
        const char* dummy;
        while (sub.read(dummy, 1, 0) > 0) {
        }

        nread = in->read(d, 25, 25);
    }
}
signed char OdfEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) { if(!in) return -1; ZipInputStream zip(in); InputStream *s = zip.nextEntry(); if (zip.status() != Ok) { m_error = zip.error(); return -1; } while (s) { // check if we're done int64_t max = idx.config().maximalStreamReadLength(idx); if (max != -1 && in->position() > max) { return 0; } // check if the analysis has been aborted if (!idx.config().indexMore()) { return 0; } if (zip.entryInfo().filename == "mimetype") { const char *buf; size_t nread; if ((nread = s->read(buf, 47, 47)) < 39) return -1; if (strncmp(buf, "application/vnd.oasis.opendocument.", 35)) return -1; const char *rdftype; buf += 35; if( nread >= (35+4) && strncmp(buf, "text", 4) == 0 ) { rdftype = NFO "PaginatedTextDocument"; } else if ( nread >= (35+12) && strncmp(buf, "presentation", 12) == 0 ) { rdftype = NFO "Presentation"; } else if ( nread >= (35+11) && strncmp(buf, "spreadsheet", 11) == 0 ) { rdftype = NFO "Spreadsheet"; } else rdftype = NFO "Document"; idx.addValue(factory->typeField, rdftype); } else if (zip.entryInfo().filename == "meta.xml") { metaHelper.analyze(idx, s); } else if (zip.entryInfo().filename == "content.xml") { contentHelper.analyze(idx,s); } else if (zip.entryInfo().filename.substr(0,9) == "Pictures/") { idx.indexChild(zip.entryInfo().filename, zip.entryInfo().mtime, s); idx.finishIndexChild(); } s = zip.nextEntry(); } if (zip.status() == Error) { m_error = zip.error(); return -1; } else { m_error.resize(0); } return 0; }