/**
 * Tell whether the analyzed item can be passed to a helper program as a
 * path on disk.
 *
 * @param idx the analysis result whose depth and path are inspected
 * @return true only when @p idx is at depth 0 and stat() succeeds on its
 *         path; false otherwise
 **/
bool
HelperEndAnalyzer::checkForFile(const AnalysisResult& idx) const {
    // results nested inside another result (depth > 0) are never looked up
    if (idx.depth() > 0) {
        return false;
    }
    struct stat info;
    // stat() yields 0 on success
    return stat(idx.path().c_str(), &info) == 0;
}
/**
 * Analyze an lzma-compressed stream.
 *
 * The input is wrapped in an LZMAInputStream and the result is tagged as an
 * nfo#Archive. If the decompressed data starts with a tar header, it is
 * handed to TarEndAnalyzer::staticAnalyze(); otherwise it is indexed as a
 * single child named after the file with a trailing ".lzma" removed.
 *
 * @param idx result object that receives the values and the child entry
 * @param in  compressed input stream; may be null
 * @return -1 if @p in is null, -2 if the first read returns a value below
 *         -1, otherwise the status returned by the tar analysis or by
 *         indexChild()
 **/
signed char
LzmaEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if(!in)
        return -1;

    LZMAInputStream stream(in);
    // since this is an lzma file, it is likely that it contains a tar file
    const char* start = 0;
    int32_t nread = stream.read(start, 1024, 0);
    // values below -1 are treated as a read failure
    // NOTE(review): -1 is presumably end-of-stream - confirm against the
    // stream API
    if (nread < -1) {
        fprintf(stderr, "Error reading lzma: %s\n", stream.error());
        return -2;
    }
    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");
    // rewind so that the next consumer sees the data from the beginning
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    } else {
        std::string name = idx.fileName();
        string::size_type len = name.length();
        if (len > 5 && name.substr(len-5)==".lzma") {
            name = name.substr(0, len-5);
        }
        signed char r = idx.indexChild(name, idx.mTime(), &stream);
        // close the child while the local decompression stream is still
        // alive, as the other analyzers in this file do
        idx.finishIndexChild();
        return r;
    }
}
/**
 * Analyze a gzip-compressed stream.
 *
 * The input is wrapped in a GZipInputStream and the result is tagged as an
 * nfo#Archive. If the decompressed data starts with a tar header, it is
 * handed to TarEndAnalyzer::staticAnalyze(); otherwise it is indexed as a
 * single child named after the file with a trailing ".gz" removed.
 *
 * @param idx result object that receives the values and the child entry
 * @param in  compressed input stream; may be null
 * @return -1 if @p in is null, -2 if the first read returns a value below
 *         -1, otherwise the status returned by the tar analysis or by
 *         indexChild()
 **/
signed char
GZipEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if(!in)
        return -1;

    GZipInputStream stream(in);
    // since this is a gzip file, it is likely that it contains a tar file
    const char* start = 0;
    int32_t nread = stream.read(start, 1024, 0);
    // values below -1 are treated as a read failure
    if (nread < -1) {
        // diagnostics belong on stderr, not on the program's regular output
        fprintf(stderr, "Error reading gzip: %s\n", stream.error());
        return -2;
    }

    idx.addValue(factory->typeField, "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");

    // rewind so that the next consumer sees the data from the beginning
    stream.reset(0);
    if (TarInputStream::checkHeader(start, nread)) {
        return TarEndAnalyzer::staticAnalyze(idx, &stream);
    } else {
        std::string file = idx.fileName();
        size_t len = file.length();
        if (len > 3 && file.substr(len-3) == ".gz") {
            file = file.substr(0, len-3);
        }
        signed char r = idx.indexChild(file, idx.mTime(), &stream);
        idx.finishIndexChild();
        return r;
    }
}
/**
 * Try to treat an entry as a Thumbs.db thumbnail record.
 *
 * The entry is accepted when 12 bytes can be read and the first 8 of them
 * equal a fixed signature. In that case everything after the 12 byte header
 * is indexed as a child called @p name.
 *
 * @param name name to give the indexed child
 * @param ar   result that receives the child
 * @param in   stream of the entry; rewound to 0 when the entry is rejected
 * @return true if the entry was recognized and indexed, false otherwise
 **/
bool
tryThumbsdbEntry(const string& name, AnalysisResult& ar, InputStream* in) {
    static const char magic[] = {0x0c, 0, 0, 0, 0x01, 0, 0, 0};
    const char* header;
    const uint32_t got = in->read(header, 12, 12);
    // the signature is only compared when the full header could be read
    const bool recognized = got == 12 && memcmp(magic, header, 8) == 0;
    if (!recognized) {
        in->reset(0);
        return false;
    }
    // the payload is whatever follows the 12 header bytes
    SubInputStream thumb(in, in->size()-12);
    ar.indexChild(name, 0, &thumb);
    ar.finishIndexChild();
    return true;
}
/**
 * Analyze an ar archive and tag the result as an nfo#Archive.
 *
 * The actual work is delegated to staticAnalyze(); the type value is added
 * afterwards regardless of the status that staticAnalyze() returned.
 *
 * @param idx result object that receives the entries and the type value
 * @param in  stream containing the archive
 * @return the status returned by staticAnalyze()
 **/
signed char
ArEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    // keep the status in the same signed type the function returns; plain
    // char may be unsigned on some platforms
    const signed char result = staticAnalyze(idx, in);
    idx.addValue(factory->typeField,
        "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Archive");
    return result;
}
/**
 * Analyze an OLE compound document.
 *
 * Every named entry of the container is offered, in this order, to tryFIB(),
 * tryThumbsdbEntry(), the property stream handler (entries whose name starts
 * with the byte 5) and the 'Pictures' handler. Entries that none of them
 * accept are indexed as plain children.
 *
 * @param ar result object that receives text, values and child entries
 * @param in stream containing the OLE container; may be null
 * @return -1 if @p in is null or the container reports a failure, 0 otherwise
 **/
signed char
OleEndAnalyzer::analyze(AnalysisResult& ar, InputStream* in) {
    if(!in)
        return -1;

    // remember the current result in the member 'result'
    result = &ar;
    OleInputStream ole(in);
    InputStream *s = ole.nextEntry();
    if (ole.status()) {
        // keep m_error in sync with the failure, as the error path after the
        // entry loop does
        m_error = ole.error();
        fprintf(stderr, "error: %s\n", ole.error());
        return -1;
    }
    while (s) {
        string name = ole.entryInfo().filename;
        if (name.size()) {
            char first = name[0];
            // a leading character with a value below 10 is not part of the
            // name used for the child
            if (first < 10) {
                name = name.substr(1);
            }
            // the first two handlers do all their work inside the call, so
            // their branches are intentionally empty
            if (tryFIB(ar, s)) {
            } else if (tryThumbsdbEntry(name, ar, s)) {
            } else if (first == 5) {
                // todo: handle property stream
                tryPropertyStream(ar, s);
            } else if (name == "Pictures") {
                tryPictures(ar, s);
            //} else if (name == "1Table" || name == "0Table") {
            //    word1Table.assign(getStreamString(s));
            } else {
                ar.indexChild(name, ole.entryInfo().mtime, s);
                ar.finishIndexChild();
            }
        }
        s = ole.nextEntry();
    }
    if (ole.status() == Error) {
        m_error = ole.error();
        return -1;
    } else {
        ar.addValue(factory->typeField, "http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document");
        m_error.resize(0);
    }
    return 0;
}
/**
 * Extract images from a 'Pictures' field from a ppt file.
 * http://jakarta.apache.org/poi/apidocs/org/apache/poi/hslf/model/Picture.html
 *
 * The stream is read as a sequence of records. Each record starts with 25
 * bytes; the little-endian 32 bit value at offset 4 minus 17 is the number
 * of payload bytes that follow. Every payload is indexed as a child named
 * "Pictures/<n>" with n counting from 1.
 *
 * Scanning stops when fewer than 25 bytes can be read or when a record
 * declares a length below 17, which cannot be valid.
 *
 * @param ar result that receives one child per picture
 * @param in stream of the 'Pictures' entry
 **/
void
tryPictures(AnalysisResult& ar, InputStream* in) {
    const char* d;
    int32_t nread = in->read(d, 25, 25);
    ostringstream s;
    int pos = 1;
    while (nread == 25) {
        const uint32_t length = (uint32_t)readLittleEndianInt32(d+4);
        if (length < 17) {
            // malformed record: subtracting 17 would wrap around to a size
            // of almost 4 GiB
            break;
        }
        uint32_t size = length-17;
        SubInputStream sub(in, size);
        s << "Pictures/" << pos++;
        ar.indexChild(s.str(), 0, &sub);
        ar.finishIndexChild();
        const char* dummy;
        while (sub.read(dummy, 1, 0) > 0) {
            // skip to the end
        }
        // reuse the string stream for the next name
        s.str("");
        nread = in->read(d, 25, 25);
    }
}
/**
 * Index the entries of an ar archive.
 *
 * Leading entries called 'debian-binary' and 'control.tar.gz' are indexed
 * regardless of the configuration, since they signal that the file is a
 * debian archive. The remaining entries are indexed only when the
 * configuration asks for archive contents.
 *
 * @param idx result that receives one child per indexed entry
 * @param in  stream containing the archive; may be null
 * @return -1 if @p in is null or the archive stream ends in the Error state,
 *         0 otherwise (including when the read limit is exceeded or the
 *         analysis is aborted)
 **/
signed char
ArEndAnalyzer::staticAnalyze(AnalysisResult& idx,
        InputStream* in) {
    if (in == 0) {
        return -1;
    }

    ArInputStream archive(in);
    InputStream* entry = archive.nextEntry();

    // entries that are always indexed when they appear, in this order, at
    // the start of the archive
    static const char* const debianEntries[] = {
        "debian-binary", "control.tar.gz"
    };
    for (int i = 0; i < 2; ++i) {
        if (entry
                && archive.entryInfo().filename.compare(debianEntries[i]) == 0) {
            idx.indexChild(archive.entryInfo().filename,
                archive.entryInfo().mtime, entry);
            idx.finishIndexChild();
            entry = archive.nextEntry();
        }
    }

    const bool indexContents = idx.config().indexArchiveContents();
    while (indexContents && entry) {
        // stop once more than the permitted amount of input was consumed
        const int64_t limit = idx.config().maximalStreamReadLength(idx);
        if (limit != -1 && in->position() > limit) {
            return 0;
        }
        // stop if the analysis has been aborted
        if (!idx.config().indexMore()) {
            return 0;
        }
        idx.indexChild(archive.entryInfo().filename,
            archive.entryInfo().mtime, entry);
        idx.finishIndexChild();
        entry = archive.nextEntry();
    }

    return (archive.status() == Error) ? -1 : 0;
}
/**
 * Run an external helper program on the stream and index its output as text.
 *
 * The first bytes of the stream are used to look up a helper in
 * helperconfig. A helper that reads from stdin gets the stream piped in;
 * any other helper is given a file path in place of each "%s" argument,
 * using the file on disk when checkForFile() allows it and a temporary copy
 * otherwise. The temporary copy is removed afterwards.
 *
 * @param idx result that receives the text produced by the helper
 * @param in  stream to analyze; may be null
 * @return -1 if @p in is null or no helper ran, the TextEndAnalyzer status
 *         when a helper ran, or the value Error when the input stream ends
 *         in the Error state
 **/
signed char
HelperEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in){
    if (in == 0) {
        return -1;
    }

    signed char state = -1;
    const char* head;
    const int32_t headLength = in->read(head, 1024, 0);
    in->reset(0);

    // a helper is only looked up when there is data to inspect
    HelperProgramConfig::HelperRecord* helper = 0;
    if (headLength > 0) {
        helper = helperconfig.findHelper(head, headLength);
    }
    if (helper) {
#if !defined(_WIN32) && !defined(_WIN64)
#warning this does not work on windows because processinputstream does not compile!
        if (helper->readfromstdin) {
            // the helper consumes the stream through its standard input
            ProcessInputStream pis(helper->arguments, in);
            TextEndAnalyzer t;
            state = t.analyze(idx, &pis);
        } else {
            // the helper needs a path: use the real file or a temporary copy
            const bool fileisondisk = checkForFile(idx);
            string filepath;
            if (fileisondisk) {
                filepath = idx.path();
            } else {
                filepath = writeToTempFile(in);
            }
            // substitute the path for every "%s" placeholder argument
            vector<string> args = helper->arguments;
            for (vector<string>::iterator arg = args.begin();
                    arg != args.end(); ++arg) {
                if (*arg == "%s") {
                    *arg = filepath;
                }
            }
            ProcessInputStream pis(args);
            TextEndAnalyzer t;
            state = t.analyze(idx, &pis);

            // only the temporary copy is ours to delete
            if (!fileisondisk) {
                unlink(filepath.c_str());
            }
        }
#endif
    }

    if (in->status() == Error) {
        m_error = in->error();
        state = Error;
    }
    return state;
}
// parse with info from
// http://www.wotsit.org/getfile.asp?file=wword8&sc=230027800
/**
 * Try to read the entry as a Word document stream and extract its text.
 *
 * The entry is accepted when its first 426 bytes can be read, start with the
 * bytes 0xec 0xa5 and bit 2 of byte 10 (the 'complex' flag) is clear. The
 * text is then taken from the byte range [fcMin, fcMac), where both offsets
 * are little-endian 32 bit values stored at offsets 24 and 28.
 *
 * @param ar result that receives the extracted text
 * @param in stream of the entry; it is rewound to 0 after every read
 * @return true if text was extracted, false if the entry is not recognized,
 *         is too short or carries offsets that cannot be valid
 **/
bool
OleEndAnalyzer::tryFIB(AnalysisResult& ar, InputStream* in) {
    const char* d;
    int32_t size = 426;
    int32_t nread = in->read(d, size, size);
    in->reset(0);
    if (nread != size
            || (unsigned char)d[0] != 0xec || (unsigned char)d[1] != 0xa5) {
        return false;
    }
    bool complex = (d[10] & 4) == 4;
    if (complex) return false;
    int32_t fcMin = readLittleEndianInt32(d+24);
    int32_t fcMac = readLittleEndianInt32(d+28);

    // The offsets come straight from the file. A negative offset would index
    // before the start of the buffer below, and an offset near the top of
    // the int32 range would overflow when 512 is added.
    const int32_t maxOffset = 0x7fffffff - 512;
    if (fcMin < 0 || fcMin > maxOffset || fcMac < 0 || fcMac > maxOffset) {
        return false;
    }

    // for some reason we sometimes need to add 512 here. No clue why.
    // if the 512 bytes starting at fcMin are all 0 we do this
    size = fcMin+512;
    nread = in->read(d, size, size);
    in->reset(0);
    if (nread != size) {
        return false;
    }
    int i;
    for (i=0; i<512 && d[i+fcMin] == 0; i++) ;
    if (i == 512) {
        fcMin += 512;
        fcMac += 512;
    }

    // make the whole range up to the end of the text available in one buffer
    size = fcMac;
    nread = in->read(d, size, size);
    in->reset(0);
    if (nread != size) {
        return false;
    }

    // feed the text to the converter in pieces of at most 512 bytes
    wordtext.reset();
    for (int32_t dp = fcMin; dp < fcMac; dp += size) {
        size = fcMac-dp;
        if (size > 512) size = 512;
        wordtext.addText(d+dp, size);
    }
    wordtext.cleanText();
    ar.addText(wordtext.text(), (int32_t)wordtext.length());
    wordtext.reset();
    return true;
}
/**
 * Analyze an OpenDocument file, which is a zip archive.
 *
 * The entries are handled by name:
 *  - "mimetype" selects the rdf type added to the result,
 *  - "meta.xml" and "content.xml" are passed to the respective helpers,
 *  - entries below "Pictures/" are indexed as children.
 * All other entries are skipped.
 *
 * @param idx result that receives values, text and child entries
 * @param in  stream containing the zip archive; may be null
 * @return -1 if @p in is null, if the archive reports a failure or if the
 *         mimetype entry is not an OpenDocument type; 0 otherwise (including
 *         when the read limit is exceeded or the analysis is aborted)
 **/
signed char
OdfEndAnalyzer::analyze(AnalysisResult& idx, InputStream* in) {
    if(!in)
        return -1;

    ZipInputStream zip(in);
    InputStream *s = zip.nextEntry();
    if (zip.status() != Ok) {
        m_error = zip.error();
        return -1;
    }

    while (s) {
        // check if we're done
        int64_t max = idx.config().maximalStreamReadLength(idx);
        if (max != -1 && in->position() > max) {
            return 0;
        }
        // check if the analysis has been aborted
        if (!idx.config().indexMore()) {
            return 0;
        }
        if (zip.entryInfo().filename == "mimetype") {
            const char *buf;
            // keep the count signed: a negative return value must not turn
            // into a huge unsigned number that passes the length check
            const int32_t nread = s->read(buf, 47, 47);
            // 39 = 35 bytes of common prefix + the shortest suffix ("text")
            if (nread < 39)
                return -1;
            if (strncmp(buf, "application/vnd.oasis.opendocument.", 35))
                return -1;

            const char *rdftype;
            buf += 35;
            if( nread >= (35+4) && strncmp(buf, "text", 4) == 0 ) {
                rdftype = NFO "PaginatedTextDocument";
            } else if ( nread >= (35+12) && strncmp(buf, "presentation", 12) == 0 ) {
                rdftype = NFO "Presentation";
            } else if ( nread >= (35+11) && strncmp(buf, "spreadsheet", 11) == 0 ) {
                rdftype = NFO "Spreadsheet";
            } else rdftype = NFO "Document";

            idx.addValue(factory->typeField, rdftype);

        } else if (zip.entryInfo().filename == "meta.xml") {
            metaHelper.analyze(idx, s);
        } else if (zip.entryInfo().filename == "content.xml") {
            contentHelper.analyze(idx,s);
        } else if (zip.entryInfo().filename.substr(0,9) == "Pictures/") {
            idx.indexChild(zip.entryInfo().filename, zip.entryInfo().mtime, s);
            idx.finishIndexChild();
        }
        s = zip.nextEntry();
    }
    if (zip.status() == Error) {
        m_error = zip.error();
        return -1;
    } else {
        m_error.resize(0);
    }
    return 0;
}
/**
 * Run all through analyzers and the matching end analyzers on one stream.
 *
 * Steps:
 *  1. make sure analyzer sets exist for the depth of @p idx,
 *  2. read up to 1024 bytes of header and chain the through analyzers in
 *     front of the stream,
 *  3. offer the header to each end analyzer in turn; the first one that
 *     accepts the header and returns 0 from analyze() ends the search, a
 *     failing one causes the stream to be rewound for the next candidate,
 *  4. skip through the rest of the stream until its size is known and all
 *     through analyzers report that they are ready,
 *  5. store the stream size and detach the analyzers from @p idx.
 *
 * @param idx   result object for this stream
 * @param input stream to analyze; may be null
 * @return -1 if an end analyzer failed and the configuration asks to stop,
 *         -2 if the stream is in the Error state after skipping, 0 otherwise
 **/
signed char
StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
    //cerr << "analyze " << idx.path().c_str() << endl;

    // retrieve or construct the through analyzers and end analyzers
    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
    vector<vector<StreamEndAnalyzer*> >::iterator eIter;
    while ((int)through.size() <= idx.depth()) {
        addThroughAnalyzers();
        addEndAnalyzers();
    }
    tIter = through.begin() + idx.depth();
    eIter = end.begin() + idx.depth();

    // read the headersize size before connecting the throughanalyzers
    // This ensures that the first read is at least this size, even if the
    // throughanalyzers read smaller chunks.
    bool finished = false;
    const char* header = 0;
    int32_t headersize = 1024;
    if (input) {
        headersize = input->read(header, headersize, headersize);
        input->reset(0);
        if (headersize < 0) finished = true;
    }

    // insert the through analyzers
    vector<StreamThroughAnalyzer*>::iterator ts;
    for (ts = tIter->begin(); (input == 0 || input->status() == Ok)
            && ts != tIter->end(); ++ts) {
        (*ts)->setIndexable(&idx);
        input = (*ts)->connectInputStream(input);
        if (input && input->position() != 0) {
            cerr << "Analyzer " << (*ts)->name() << " has left the stream in a bad state." << endl;
        }
    }

    // reread the header so we can use it for the endanalyzers
    if (input && headersize > 0) {
        headersize = input->read(header, headersize, headersize);
        if (headersize <= 0) {
            finished = true;
        } else if (input->reset(0) != 0) {
            cerr << "resetting is impossible!! pos: " << input->position()
                << " status: " << input->status() << endl;
        }
    } else {
        // indicate that we have no data in the stream
        headersize = -1;
        finished = true;
    }
    size_t es = 0;
    size_t itersize = eIter->size();
    while (!finished && es != itersize) {
        StreamEndAnalyzer* sea = (*eIter)[es];
        if (sea->checkHeader(header, headersize)) {
            idx.setEndAnalyzer(sea);
            // analyze() returns signed char; plain char may be unsigned
            signed char ar = sea->analyze(idx, input);
            if (ar) {
// FIXME: find either a NIE-compliant way to report errors or use some API for this
//                idx.addValue(errorfield, sea->name() + string(": ")
//                    + sea->error());
                if (!idx.config().indexMore()) {
                    removeIndexable(idx.depth());
                    return -1;
                }
                int64_t pos = input->reset(0);
                if (pos != 0) { // could not reset
                    cerr << "could not reset stream of " << idx.path().c_str()
                        << " from pos " << input->position()
                        << " to 0 after reading with " << sea->name()
                        << ": " << sea->error().c_str() << endl;
                    finished = true;
                } else {
                    // refresh the pointer to the start of the data
                    headersize = input->read(header, headersize, headersize);
                    if (input->reset(0) != 0) {
                        cerr << "resetting again is impossible!! pos: "
                             << input->position() << " status: "
                             << input->status() << endl;
                    }
                    if (headersize < 0) finished = true;
                }
            } else {
                finished = true;
            }
            // recompute the iterator after the end analyzer has run
            // NOTE(review): presumably because nested analysis can grow
            // 'end' and invalidate it - confirm
            eIter = end.begin() + idx.depth();
        }
        if (!finished) {
            finished = !conf.indexMore();
        }
        es++;
    }
    idx.setEndAnalyzer(0);
    if (input) {
        // make sure the entire stream is read if the size is not known
        bool ready;
        tIter = through.begin() + idx.depth();
        uint32_t skipsize = 4096;
        do {
            // ask the analyzerconfiguration if we should continue
            int64_t max = idx.config().maximalStreamReadLength(idx);
            if (!idx.config().indexMore()
                    || (max != -1 && input->position() >= max)) {
                // we are done; detach the analyzers from the result like
                // every other exit path does
                removeIndexable(idx.depth());
                return 0;
            }
            ready = input->size() != -1;
            vector<StreamThroughAnalyzer*>::iterator ts;
            for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
                ready = (*ts)->isReadyWithStream();
            }
            if (!ready) {
                input->skip(skipsize);
                // grow the skip size by a factor of 4 until it reaches
                // at least 131072
                if (skipsize < 131072) {
                    skipsize *= 4;
                }
            }
        } while (!ready && input->status() == Ok);
        if (input->status() == Error) {
            fprintf(stderr, "Error: %s\n", input->error());
            removeIndexable(idx.depth());
            return -2;
        }
    }

    // store the size of the stream
    if (input && input->status() != Error && input->size() >= 0) {
        // TODO remove cast
        idx.addValue(sizefield, (uint32_t)input->size());
    }

    // remove references to the analysisresult before it goes out of scope
    removeIndexable(idx.depth());
    return 0;
}
/**
 * Extract text and the title from a file through a Windows IFilter.
 *
 * Only files whose extension is listed in 'extensions' are handled. The
 * filter is loaded by path: the file itself is used when checkForFile()
 * allows it, otherwise the stream is written to a temporary file that is
 * removed again before returning.
 *
 * @param idx result that receives the text and the "title" value
 * @param in  stream with the file contents, used for the temporary copy
 * @return 0 if the filter could be loaded and initialized, -1 otherwise
 **/
signed char
IFilterEndAnalyzer::analyze(AnalysisResult& idx, InputStream *in) {
    const string& filename = idx.fileName();
    int p = filename.find_last_of('.');
    if (p < 0 ||  extensions.find(filename.substr(p)) == extensions.end()) {
        return -1;
    }

    string filepath;
    bool fileisondisk = checkForFile(idx.depth(), filename);
    if (fileisondisk) {
        filepath = filename;
    } else {
        // give the temporary file the original extension, but only when the
        // extension consists of harmless characters
        int p = filename.find_last_of(".");
        if ( p > 0 ){
            string ext = filename.substr(p).c_str();
            // lower-case through the writable buffer of the string instead
            // of casting away the constness of c_str(); ext is never empty
            // here because it starts with the '.'
            strlwr(&ext[0]);
            p = ext.find_first_not_of("._abcdefghijklmnopqrstuvwxyz0123456789");
            if ( p >= 0 )
                filepath = writeToTempFile(in, "");
            else
                filepath = writeToTempFile(in, ext.c_str());
        }else
            filepath = writeToTempFile(in, "");

    }

    if (filepath.length() > 0) {

        IFilter* filter = NULL;
        void* pvfilter=NULL;

        wchar_t tmp[MAX_PATH];
        _cpycharToWide(tmp,filepath.c_str(),MAX_PATH);
        HRESULT hr = LoadIFilter(tmp,NULL,&pvfilter);
        if (hr == S_OK) {
            filter = (IFilter*)pvfilter;

            ULONG __i=0;
            hr = filter->Init(IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,0,NULL,&__i);
            if (FAILED( hr )) {
                // the filter was loaded successfully, so it has to be
                // released on this path as well
                filter->Release();
                if (!fileisondisk)
                    unlink(filepath.c_str());
                return -1;
            }

            const int sbBufferLen = 1024;
            wchar_t sbBuffer[sbBufferLen];

            STAT_CHUNK ps;
            hr = filter->GetChunk(&ps);
            while ( SUCCEEDED(hr) ) {
                if (ps.flags == CHUNK_TEXT) {
                    int resultText = 0;

                    // read until GetText reports a failure code
                    while ( resultText >= 0 ) {
                        ULONG sizeBuffer=sbBufferLen;
                        resultText = filter->GetText(&sizeBuffer, sbBuffer);
                        // only use the buffer when the call succeeded; after
                        // a failure its contents and the count are not
                        // reliable
                        if (resultText >= 0 && sizeBuffer > 0 ) {
                            string str = wchartoutf8(sbBuffer,sbBuffer+sizeBuffer);
                            idx.addText(str.c_str(),str.length());
                        }
                    }
                } else if ( ps.flags == CHUNK_VALUE ) {
                    PROPVARIANT *pVar;
                    while ( SUCCEEDED( hr = filter->GetValue( &pVar ) ) ) {
                        //printf("propid: %d\nkind:%d\n",ps.attribute.psProperty.propid,ps.attribute.psProperty.ulKind);
                        // property id 2 of kind 1 holding a wide string is
                        // stored as the title
                        if ( ps.attribute.psProperty.propid == 2 &&
                             ps.attribute.psProperty.ulKind == 1 &&
                             pVar->vt == VT_LPWSTR ) {

                            string str = wchartoutf8(pVar->pwszVal,pVar->pwszVal+wcslen(pVar->pwszVal));
                            idx.addValue("title", str );
                        }
                        PropVariantClear( pVar );
                        CoTaskMemFree( pVar );
                    }
                } else {
                    printf("other flag %d\n",ps.flags);
                }
                hr = filter->GetChunk(&ps);
            }
            filter->Release();
            if (!fileisondisk)
                unlink(filepath.c_str());
            return 0;
        }


        // loading the filter failed: print the system message, if any
        DWORD dw = GetLastError();
        if ( dw != 0 ) {
            LPVOID lpMsgBuf;
            FormatMessage(
                FORMAT_MESSAGE_ALLOCATE_BUFFER |
                FORMAT_MESSAGE_FROM_SYSTEM,
                NULL,
                dw,
                MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                (LPTSTR) &lpMsgBuf,
                0, NULL );

            wprintf(L"%s\n", lpMsgBuf);
            LocalFree(lpMsgBuf);
        }
    }
    // remove the temporary copy on the failure path
    if (!fileisondisk && filepath.length()>0) {
        unlink(filepath.c_str());
    }
    return -1;
}