예제 #1
0
/**
 * Analyze a gzip-compressed stream.
 *
 * The stream is wrapped in a GZipInputStream and tagged as an nfo#Archive.
 * If the first bytes of the decompressed data look like a tar header, the
 * decompressed stream is handed to TarEndAnalyzer::staticAnalyze(); otherwise
 * it is indexed as a single child named after this file with a trailing
 * ".gz" removed.
 *
 * @param idx analysis result that receives the type value and the child
 * @param in  compressed input stream; may be null
 * @return -1 if @p in is null, -2 if reading the decompressed data failed,
 *         otherwise the value returned by the tar analyzer or by indexChild()
 */
signed char
GZipEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if (!in)
        return -1;

    GZipInputStream stream(in);
    // Peek at the start of the decompressed data: a gzip file is likely to
    // contain a tar file.
    const char* start = 0;
    const int32_t nread = stream.read(start, 1024, 0);
    if (nread < -1) {
        // Report on stderr, like the other analyzers do, so diagnostics do
        // not get mixed into regular output on stdout.
        fprintf(stderr, "Error reading gzip: %s\n", stream.error());
        return -2;
    }

    idx.addValue(factory->typeField, "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");

    // Rewind so that whoever consumes the stream next starts at offset 0.
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    }

    // Not a tar archive: index the decompressed data as one child whose name
    // is the file name without the ".gz" suffix (if it has one).
    std::string file = idx.fileName();
    const size_t len = file.length();
    if (len > 3 && file.compare(len - 3, 3, ".gz") == 0) {
        file.resize(len - 3);
    }
    const signed char r = idx.indexChild(file, idx.mTime(), &stream);
    idx.finishIndexChild();
    return r;
}
예제 #2
0
/**
 * Analyze a bzip2-compressed stream.
 *
 * Tags the result as an nfo#Archive, then either delegates to the tar
 * analyzer (when the decompressed data starts with a tar header) or indexes
 * the decompressed data as one child named after this file without a
 * trailing ".bz2".
 *
 * @param idx analysis result that receives the type value and the child
 * @param in  compressed input stream; may be null
 * @return -1 if @p in is null, -2 if reading the decompressed data failed,
 *         otherwise the value returned by the tar analyzer or by indexChild()
 */
signed char
Bz2EndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if (!in) {
        return -1;
    }

    BZ2InputStream bz2(in);
    // NOTE: a testStream() pre-check used to run at this point; it is
    // currently disabled.

    // Look at the first decompressed bytes; a bz2 file likely wraps a tar
    // file.
    const char* head = 0;
    const int32_t headLength = bz2.read(head, 1024, 0);
    if (headLength < -1) {
        fprintf(stderr, "Error reading bz2: %s\n", bz2.error());
        return -2;
    }

    idx.addValue(factory->typeField, "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");

    // Rewind before passing the stream on.
    bz2.reset(0);

    if (TarInputStream::checkHeader(head, headLength)) {
        return TarEndAnalyzer::staticAnalyze(idx, &bz2);
    }

    // Plain compressed file: the child is named like this file minus the
    // ".bz2" suffix, when that suffix is present.
    static const char suffix[] = ".bz2";
    const size_t suffixLength = sizeof(suffix) - 1;
    std::string childName = idx.fileName();
    const size_t nameLength = childName.length();
    if (nameLength > suffixLength
            && childName.compare(nameLength - suffixLength, suffixLength,
                                 suffix) == 0) {
        childName.erase(nameLength - suffixLength);
    }

    const signed char status = idx.indexChild(childName, idx.mTime(), &bz2);
    idx.finishIndexChild();
    return status;
}
예제 #3
0
/**
 * Index the entries of an ar archive.
 *
 * Leading entries named 'debian-binary' and 'control.tar.gz' are always
 * indexed, since they signal that the file is a debian archive. The
 * remaining entries are indexed only when the configuration asks for archive
 * contents, and only until the read limit is exceeded or the analysis is
 * aborted.
 *
 * @param idx analysis result that receives the children
 * @param in  archive input stream; may be null
 * @return -1 if @p in is null or the archive stream ended in an error state,
 *         0 otherwise (including when indexing stops early)
 */
signed char
ArEndAnalyzer::staticAnalyze(AnalysisResult& idx,
        InputStream* in) {
    if (!in) {
        return -1;
    }

    ArInputStream ar(in);
    InputStream* entry = ar.nextEntry();

    // Entries that are analyzed regardless of the configuration, in the
    // order in which they are expected to appear.
    static const char* const debianEntries[] = {
        "debian-binary",
        "control.tar.gz"
    };
    const size_t debianEntryCount
        = sizeof(debianEntries) / sizeof(debianEntries[0]);
    for (size_t i = 0; i < debianEntryCount; ++i) {
        if (entry && ar.entryInfo().filename.compare(debianEntries[i]) == 0) {
            idx.indexChild(ar.entryInfo().filename, ar.entryInfo().mtime,
                entry);
            idx.finishIndexChild();
            entry = ar.nextEntry();
        }
    }

    if (idx.config().indexArchiveContents()) {
        for (; entry; entry = ar.nextEntry()) {
            // stop once more than the permitted amount has been read
            const int64_t limit = idx.config().maximalStreamReadLength(idx);
            if (limit != -1 && in->position() > limit) {
                return 0;
            }
            // stop if the analysis has been aborted
            if (!idx.config().indexMore()) {
                return 0;
            }
            idx.indexChild(ar.entryInfo().filename, ar.entryInfo().mtime,
                entry);
            idx.finishIndexChild();
        }
    }

    return (ar.status() == Error) ? -1 : 0;
}
예제 #4
0
/**
 * Try to index a stream that starts with a 12 byte header whose first 8
 * bytes equal a fixed magic sequence.
 *
 * On a match the data after the header is indexed as a child called
 * @p name. On a mismatch (or a short/failed read) the stream is reset to
 * offset 0 so that another handler can inspect it.
 *
 * @param name name under which the child is indexed
 * @param ar   analysis result that receives the child
 * @param in   stream to inspect; a null stream is rejected
 * @return true if the header matched and the child was indexed, else false
 */
bool
tryThumbsdbEntry(const string& name, AnalysisResult& ar, InputStream* in) {
    static const char magic[] = {0x0c, 0, 0, 0, 0x01, 0, 0, 0};
    if (!in)
        return false;

    // The read result is kept signed and the out-pointer starts out null, so
    // a failed read can never be mistaken for a large byte count or leave
    // 'd' indeterminate.
    const char* d = 0;
    const int32_t nread = in->read(d, 12, 12);
    if (nread != 12 || memcmp(magic, d, sizeof(magic)) != 0) {
        in->reset(0);
        return false;
    }
    // Everything after the 12 byte header is handed on as the child stream.
    // NOTE(review): this assumes in->size() is known here; confirm what
    // SubInputStream does if the size is reported as unknown.
    SubInputStream thumb(in, in->size()-12);
    ar.indexChild(name, 0, &thumb);
    ar.finishIndexChild();
    return true;
}
예제 #5
0
/**
 * Analyze an OLE compound document by walking over its entries.
 *
 * Each named entry is offered, in this order, to tryFIB(),
 * tryThumbsdbEntry(), tryPropertyStream() (entries whose name starts with
 * byte 5) and tryPictures() (entries named "Pictures"); anything left over
 * is indexed as a plain child. On success the result is tagged as an
 * nfo#Document.
 *
 * @param ar analysis result that receives values and children
 * @param in input stream of the OLE file; may be null
 * @return -1 if @p in is null or the OLE stream reports a non-zero status
 *         after the first entry or an error after the last one, else 0
 */
signed char
OleEndAnalyzer::analyze(AnalysisResult& ar, InputStream* in) {
    if (!in)
        return -1;

    result = &ar;
    OleInputStream ole(in);
    InputStream* s = ole.nextEntry();
    if (ole.status()) {
        // Record the error like the final error path does, in addition to
        // printing it.
        m_error = ole.error();
        fprintf(stderr, "error: %s\n", ole.error());
        return -1;
    }

    for (; s; s = ole.nextEntry()) {
        string name = ole.entryInfo().filename;
        if (name.empty()) {
            continue;
        }

        // Names may start with a small control byte; strip it from the
        // name. Compare as unsigned so that bytes >= 0x80 are never mistaken
        // for such a prefix on platforms where plain char is signed.
        const unsigned char first = static_cast<unsigned char>(name[0]);
        if (first < 10) {
            name.erase(0, 1);
        }

        if (tryFIB(ar, s)) {
            continue;
        }
        if (tryThumbsdbEntry(name, ar, s)) {
            continue;
        }
        if (first == 5) {
            // TODO: handle property stream
            tryPropertyStream(ar, s);
        } else if (name == "Pictures") {
            tryPictures(ar, s);
        } else {
            ar.indexChild(name, ole.entryInfo().mtime, s);
            ar.finishIndexChild();
        }
    }

    if (ole.status() == Error) {
        m_error = ole.error();
        return -1;
    }
    ar.addValue(factory->typeField, "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document");
    m_error.resize(0);
    return 0;
}
예제 #6
0
/**
 * Extract images from a 'Pictures' field from a ppt file.
 * http://jakarta.apache.org/poi/apidocs/org/apache/poi/hslf/model/Picture.html
 *
 * The stream is read as a sequence of records. Each record starts with a
 * 25 byte header; the little-endian 32 bit value at offset 4 is used as a
 * length that includes the last 17 header bytes, so the picture data is
 * that value minus 17 bytes long. Every picture is indexed as a child
 * named "Pictures/<n>", with n counting from 1.
 *
 * Parsing stops at the first short read or at a record whose length field
 * is smaller than 17, which cannot describe a valid record.
 *
 * @param ar analysis result that receives the children
 * @param in stream holding the picture records; a null stream is ignored
 **/
void
tryPictures(AnalysisResult& ar, InputStream* in) {
    if (!in)
        return;

    const char* d = 0;
    int32_t nread = in->read(d, 25, 25);
    int pos = 1;
    while (nread == 25) {
        // Widen before subtracting: a length below 17 would otherwise wrap
        // around to a bogus, huge picture size.
        const int64_t recordLength = readLittleEndianInt32(d+4);
        if (recordLength < 17) {
            break;
        }
        const uint32_t size = static_cast<uint32_t>(recordLength - 17);

        SubInputStream sub(in, size);
        ostringstream s;
        s << "Pictures/" << pos++;
        ar.indexChild(s.str(), 0, &sub);
        ar.finishIndexChild();

        // Drain whatever the child analysis left unread so that the next
        // header is read from the start of the next record.
        const char* dummy;
        while (sub.read(dummy, 1, 0) > 0) {
            // skip to the end
        }
        nread = in->read(d, 25, 25);
    }
}
예제 #7
0
/**
 * Analyze an OpenDocument file, which is a zip archive.
 *
 * The entries are handled by name:
 *  - "mimetype" must start with "application/vnd.oasis.opendocument." and
 *    selects the RDF type added to the result,
 *  - "meta.xml" and "content.xml" are passed to the respective helpers,
 *  - entries below "Pictures/" are indexed as children.
 * All other entries are skipped.
 *
 * @param idx analysis result that receives values and children
 * @param in  input stream of the zip file; may be null
 * @return -1 if @p in is null, the zip stream is not Ok after the first
 *         entry or in error after the last one, or the mimetype entry is
 *         too short or has an unexpected prefix; 0 otherwise (including
 *         when the read limit is exceeded or the analysis is aborted)
 */
signed char
OdfEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if (!in)
        return -1;

    ZipInputStream zip(in);
    InputStream* s = zip.nextEntry();
    if (zip.status() != Ok) {
        m_error = zip.error();
        return -1;
    }

    while (s) {
        // check if we're done
        const int64_t max = idx.config().maximalStreamReadLength(idx);
        if (max != -1 && in->position() > max) {
            return 0;
        }
        // check if the analysis has been aborted
        if (!idx.config().indexMore()) {
            return 0;
        }
        if (zip.entryInfo().filename == "mimetype") {
            // Keep the read result signed. Stored in an unsigned type, a
            // negative return value would turn into a huge length, pass the
            // minimum-length check and let strncmp() run on an invalid
            // buffer.
            const char* buf = 0;
            const int32_t nread = s->read(buf, 47, 47);
            if (nread < 39)
                return -1;
            if (strncmp(buf, "application/vnd.oasis.opendocument.", 35))
                return -1;

            // Classify by the part that follows the 35 byte prefix; each
            // comparison is guarded by the number of bytes actually read.
            const char* rdftype;
            buf += 35;
            if (nread >= (35+4) && strncmp(buf, "text", 4) == 0) {
                rdftype = NFO "PaginatedTextDocument";
            } else if (nread >= (35+12)
                    && strncmp(buf, "presentation", 12) == 0) {
                rdftype = NFO "Presentation";
            } else if (nread >= (35+11)
                    && strncmp(buf, "spreadsheet", 11) == 0) {
                rdftype = NFO "Spreadsheet";
            } else {
                rdftype = NFO "Document";
            }

            idx.addValue(factory->typeField, rdftype);
        } else if (zip.entryInfo().filename == "meta.xml") {
            metaHelper.analyze(idx, s);
        } else if (zip.entryInfo().filename == "content.xml") {
            contentHelper.analyze(idx, s);
        } else if (zip.entryInfo().filename.compare(0, 9, "Pictures/") == 0) {
            idx.indexChild(zip.entryInfo().filename, zip.entryInfo().mtime, s);
            idx.finishIndexChild();
        }
        s = zip.nextEntry();
    }

    if (zip.status() == Error) {
        m_error = zip.error();
        return -1;
    }
    m_error.resize(0);
    return 0;
}