bool
HelperEndAnalyzer::checkForFile(const AnalysisResult& idx) const {
    if (idx.depth() > 0) return false;
    struct stat s;
    if (stat(idx.path().c_str(), &s)) return false;
    return true;
}
示例#2
0
signed char
StreamAnalyzerPrivate::analyze(AnalysisResult& idx, StreamBase<char>* input) {
    //cerr << "analyze " << idx.path().c_str() << endl;

    // retrieve or construct the through analyzers and end analyzers
    vector<vector<StreamThroughAnalyzer*> >::iterator tIter;
    vector<vector<StreamEndAnalyzer*> >::iterator eIter;
    while ((int)through.size() <= idx.depth()) {
        addThroughAnalyzers();
        addEndAnalyzers();
    }
    tIter = through.begin() + idx.depth();
    eIter = end.begin() + idx.depth();

    // read the headersize size before connecting the throughanalyzers
    // This ensures that the first read is at least this size, even if the
    // throughanalyzers read smaller chunks.
    bool finished = false;
    const char* header = 0;
    int32_t headersize = 1024;
    if (input) {
        headersize = input->read(header, headersize, headersize);
        input->reset(0);
        if (headersize < 0) finished = true;
    }

    // insert the through analyzers
    vector<StreamThroughAnalyzer*>::iterator ts;
    for (ts = tIter->begin(); (input == 0 || input->status() == Ok)
            && ts != tIter->end(); ++ts) {
        (*ts)->setIndexable(&idx);
        input = (*ts)->connectInputStream(input);
        if (input && input->position() != 0) {
            cerr << "Analyzer " << (*ts)->name() << " has left the stream in a bad state." << endl;
        }
    }

    // reread the header so we can use it for the endanalyzers
    if (input && headersize > 0) {
        headersize = input->read(header, headersize, headersize);
        if (headersize <= 0) {
            finished = true;
        } else if (input->reset(0) != 0) {
            cerr << "resetting is impossible!! pos: " << input->position()
                << " status: " << input->status() << endl;
        }
    } else {
        // indicate that we have no data in the stream
        headersize = -1;
        finished = true;
    }
    size_t es = 0;
    size_t itersize = eIter->size();
    while (!finished && es != itersize) {
        StreamEndAnalyzer* sea = (*eIter)[es];
        if (sea->checkHeader(header, headersize)) {
            idx.setEndAnalyzer(sea);
            char ar = sea->analyze(idx, input);
            if (ar) {
// FIXME: find either a NIE-compliant way to report errors or use some API for this
//                idx.addValue(errorfield, sea->name() + string(": ")
//                    + sea->error());
                if (!idx.config().indexMore()) {
                    removeIndexable(idx.depth());
                    return -1;
                }
                int64_t pos = input->reset(0);
                if (pos != 0) { // could not reset
                    cerr << "could not reset stream of " << idx.path().c_str()
                        << " from pos " << input->position()
                        << " to 0 after reading with " << sea->name()
                        << ": " << sea->error().c_str() << endl;
                    finished = true;
                } else {
                    // refresh the pointer to the start of the data
                    headersize = input->read(header, headersize, headersize);
    		    if (input->reset(0) != 0) {
        		cerr << "resetting again is impossible!! pos: "
                             << input->position() << " status: "
                             << input->status() << endl;
    		    }
                    if (headersize < 0) finished = true;
                }
            } else {
                finished = true;
            }
            eIter = end.begin() + idx.depth();
        }
        if (!finished) {
            finished = !conf.indexMore();
        }
        es++;
    }
    idx.setEndAnalyzer(0);
    if (input) {
        // make sure the entire stream is read if the size is not known
        bool ready;
        tIter = through.begin() + idx.depth();
        uint32_t skipsize = 4096;
        do {
            // ask the analyzerconfiguration if we should continue
            int64_t max = idx.config().maximalStreamReadLength(idx);
            if (!idx.config().indexMore()
                    || (max != -1 && input->position() >= max)) {
                // we are done
                return 0;
            }
            ready = input->size() != -1;
            vector<StreamThroughAnalyzer*>::iterator ts;
            for (ts = tIter->begin(); ready && ts != tIter->end(); ++ts) {
                ready = (*ts)->isReadyWithStream();
            }
            if (!ready) {
                input->skip(skipsize);
                if (skipsize < 131072) {
                    skipsize *= 4;
                }
            }
        } while (!ready && input->status() == Ok);
        if (input->status() == Error) {
            fprintf(stderr, "Error: %s\n", input->error());
            removeIndexable(idx.depth());
            return -2;
        }
    }

    // store the size of the stream
    if (input && input->status() != Error && input->size() >= 0) {
        // TODO remove cast
        idx.addValue(sizefield, (uint32_t)input->size());
    }

    // remove references to the analysisresult before it goes out of scope
    removeIndexable(idx.depth());
    return 0;
}
signed char
IFilterEndAnalyzer::analyze(AnalysisResult& idx, InputStream *in) {
    const string& filename = idx.fileName();
    int p = filename.find_last_of('.');
    if (p < 0 ||  extensions.find(filename.substr(p)) == extensions.end()) {
        return -1;
    }

    string filepath;
    bool fileisondisk = checkForFile(idx.depth(), filename);
    if (fileisondisk) {
        filepath = filename;
    } else {
        int p = filename.find_last_of(".");
        if ( p > 0 ){
            string ext = filename.substr(p).c_str();
            strlwr((char*)ext.c_str());
            p = ext.find_first_not_of("._abcdefghijklmnopqrstuvwxyz0123456789");
            if ( p >= 0 )
                filepath = writeToTempFile(in, "");
            else
                filepath = writeToTempFile(in, ext.c_str());
        }else
            filepath = writeToTempFile(in, "");

    }

    if (filepath.length() > 0) {

        IFilter* filter = NULL;
        void* pvfilter=NULL;

        wchar_t tmp[MAX_PATH];
        _cpycharToWide(tmp,filepath.c_str(),MAX_PATH);
        HRESULT hr = LoadIFilter(tmp,NULL,&pvfilter);
        if (hr == S_OK) {
            filter = (IFilter*)pvfilter;

            ULONG __i=0;
            hr = filter->Init(IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,0,NULL,&__i);
            if (FAILED( hr )) {
                if (!fileisondisk)
                    unlink(filepath.c_str());
                return -1;
            }

            const int sbBufferLen = 1024;
            wchar_t sbBuffer[sbBufferLen];

            STAT_CHUNK ps;
            hr = filter->GetChunk(&ps);
            while ( SUCCEEDED(hr) ) {
                if (ps.flags == CHUNK_TEXT) {
                    int resultText = 0;

                    while ( resultText >= 0 ) {
                        ULONG sizeBuffer=sbBufferLen;
                        resultText = filter->GetText(&sizeBuffer, sbBuffer);
                        if (sizeBuffer > 0 ) {
                            string str = wchartoutf8(sbBuffer,sbBuffer+sizeBuffer);
                            idx.addText(str.c_str(),str.length());
                        }
                    }
                } else if ( ps.flags == CHUNK_VALUE ) {
                    PROPVARIANT *pVar;
                    while ( SUCCEEDED( hr = filter->GetValue( &pVar ) ) ) {
                        //printf("propid: %d\nkind:%d\n",ps.attribute.psProperty.propid,ps.attribute.psProperty.ulKind);
                        if ( ps.attribute.psProperty.propid == 2 &&
                             ps.attribute.psProperty.ulKind == 1 &&
                             pVar->vt == VT_LPWSTR ) {

                            string str = wchartoutf8(pVar->pwszVal,pVar->pwszVal+wcslen(pVar->pwszVal));
                            idx.addValue("title", str );
                        }
                        PropVariantClear( pVar );
                        CoTaskMemFree( pVar );
                    }
                } else {
                    printf("other flag %d\n",ps.flags);
                }
                hr = filter->GetChunk(&ps);
            }
            filter->Release();
            if (!fileisondisk)
                unlink(filepath.c_str());
            return 0;
        }


        DWORD dw = GetLastError();
        if ( dw != 0 ) {
            LPVOID lpMsgBuf;
            FormatMessage(
                FORMAT_MESSAGE_ALLOCATE_BUFFER |
                FORMAT_MESSAGE_FROM_SYSTEM,
                NULL,
                dw,
                MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                (LPTSTR) &lpMsgBuf,
                0, NULL );

            wprintf(L"%s\n", lpMsgBuf);
            LocalFree(lpMsgBuf);
        }
    }
    if (!fileisondisk && filepath.length()>0) {
        unlink(filepath.c_str());
    }
    return -1;
}