Esempio n. 1
0
// @pymethod |PyIFilter|GetText|Retrieves a block of text from the filter by calling IFilter::GetText.
// @comm The text is read into a temporary buffer of nBufSize characters and
// returned as a string built from the number of characters the filter reports
// having written. A failure HRESULT from the filter is raised as a COM exception.
PyObject *PyIFilter::GetText(PyObject *self, PyObject *args)
{
	IFilter *pIF = GetI(self);
	if ( pIF == NULL )
		return NULL;

	// @pyparm int|nBufSize|8192|Size of the text buffer to create, in characters. 0 selects the default.
	// The "i" format writes a signed int, so parse into an int and validate it
	// before it becomes the unsigned count handed to the filter: a negative
	// value would otherwise wrap to a huge capacity backed by a tiny buffer.
	int nRequested = 0;
	if ( !PyArg_ParseTuple(args, "|i:GetText", &nRequested) )
		return NULL;
	if ( nRequested < 0 ) {
		PyErr_SetString(PyExc_ValueError, "nBufSize must not be negative");
		return NULL;
	}

	ULONG nBufSize = (nRequested == 0) ? 8192 : (ULONG)nRequested; // 8k default

	// One extra character of slack, as before; make sure the byte count
	// cannot wrap around size_t (possible on 32-bit builds).
	size_t cchAlloc = (size_t)nBufSize + 1;
	if ( cchAlloc > ((size_t)-1) / sizeof(WCHAR) ) {
		PyErr_SetString(PyExc_MemoryError, "getting text");
		return NULL;
	}

	WCHAR *wBuffer = (WCHAR *)PyMem_Malloc(cchAlloc * sizeof(WCHAR));
	if (!wBuffer){
		PyErr_SetString(PyExc_MemoryError, "getting text");
		return NULL;
	}

	// On return nBufSize holds the number of characters actually written.
	HRESULT hr;
	PY_INTERFACE_PRECALL;
	hr = pIF->GetText( &nBufSize, wBuffer );
	PY_INTERFACE_POSTCALL;

	if ( FAILED(hr) ) {
		PyMem_Free(wBuffer);
		return PyCom_BuildPyException(hr, pIF, IID_IFilter );
	}

	PyObject *obRet = PyWinObject_FromWCHAR(wBuffer, nBufSize);
	PyMem_Free(wBuffer);
	return obRet;
}
/**
 * Extracts the text of a document through its registered IFilter and writes
 * it to stdout encoded as UTF-8.
 *
 * Chunks reported as FILTER_E_EMBEDDING_UNAVAILABLE or
 * FILTER_E_LINK_UNAVAILABLE are skipped; FILTER_E_END_OF_CHUNKS ends the walk
 * successfully. Within a chunk, FILTER_E_NO_TEXT / FILTER_E_NO_MORE_TEXT end
 * the text loop successfully.
 *
 * Whenever a diagnostic is written to tcerr, errorMessagePrinted is set.
 *
 * @param szPath  path of the document, passed to LoadIFilter
 * @return S_OK on success, otherwise the failing HRESULT (E_FAIL for
 *         conversion/write errors, E_OUTOFMEMORY if the UTF-8 buffer could
 *         not be allocated)
 */
HRESULT Analyze(wchar_t* szPath) {
    // Load the IFilter associated with the specified file
    IFilter* pFilter = NULL;
    HRESULT hr = LoadIFilter(szPath, NULL, (void**)&pFilter);
    if (FAILED(hr)) {
        tcerr << "IFilter loading failed with HRESULT " << hr << endl;
        errorMessagePrinted = true;
        return hr;
    }

    // Initialize the IFilter
    DWORD dwFlags = 0;
    hr = pFilter->Init(FILTER_INIT_OPTIONS, 0, NULL, &dwFlags);
    if (FAILED(hr)) {
        tcerr << "IFilter initialization failed with HRESULT " << hr << endl;
        errorMessagePrinted = true;
        pFilter->Release();
        return hr;
    }

    wchar_t szBuffer[BUFLEN];
    // Reserve the last slot for the terminator appended below. Asking the
    // filter for BUFLEN characters would let it fill the whole array and make
    // the terminating write land one element past the end.
    const ULONG cchMaxText = BUFLEN - 1;
    STAT_CHUNK ps;

    while (SUCCEEDED(hr)) {
        // Retrieve the next chunk in the document
        hr = pFilter->GetChunk(&ps);
        if ((FILTER_E_EMBEDDING_UNAVAILABLE == hr) || (FILTER_E_LINK_UNAVAILABLE == hr)) {
            hr = S_OK;
            continue;
        }
        if (FILTER_E_END_OF_CHUNKS == hr) {
            hr = S_OK;
            break;
        }

        // Any other failure from GetChunk falls through: the inner loop is
        // skipped and the outer condition ends the walk with that HRESULT.
        while (SUCCEEDED(hr)) {
            // Retrieve the next block of text in the current chunk
            ULONG ulSize = cchMaxText;
            hr = pFilter->GetText(&ulSize, szBuffer);
            if ((FILTER_E_NO_TEXT == hr) || (FILTER_E_NO_MORE_TEXT == hr)) {
                hr = S_OK;
                break;
            }
            if (FAILED(hr) || (0 == ulSize)) {
                continue; // loop condition decides whether to go on
            }

            // Defensive: never trust the reported count beyond our capacity.
            if (ulSize > cchMaxText) {
                ulSize = cchMaxText;
            }
            szBuffer[ulSize] = L'\0';

            // Convert to UTF8. First call only measures the required size.
            // NOTE(review): with cchWideChar == -1 the reported size includes
            // the terminating NUL, so a NUL byte is written to stdout after
            // every block — confirm consumers expect that.
            unsigned int cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, szBuffer, -1, NULL, 0, NULL, NULL);
            if (0 == cbMultiByte) {
                hr = E_FAIL;
                tcerr << "WideCharToMultiByte#1 invocation failed" << endl;
                errorMessagePrinted = true;
                continue;
            }

            char* pchMultiByte = new char[cbMultiByte];
            // Only reachable with a non-throwing operator new; kept so such
            // builds still report E_OUTOFMEMORY.
            if (NULL == pchMultiByte) {
                hr = E_OUTOFMEMORY;
                continue;
            }

            if (0 == WideCharToMultiByte(CP_UTF8, 0, szBuffer, -1, pchMultiByte, cbMultiByte, NULL, NULL)) {
                hr = E_FAIL;
                tcerr << "WideCharToMultiByte#2 invocation failed" << endl;
                errorMessagePrinted = true;
            } else if (cbMultiByte > fwrite(pchMultiByte, 1, cbMultiByte, stdout)) {
                // Write the UTF8 text to stdout; a short write is an error
                hr = E_FAIL;
                tcerr << "Unable to write converted bytes to output" << endl;
                errorMessagePrinted = true;
            }
            delete[] pchMultiByte;
        }
    }

    pFilter->Release();
    return hr;
}
/**
 * Indexes a document by running it through the IFilter registered for it.
 *
 * Files whose extension is not present in `extensions` are rejected. When
 * checkForFile() reports that the file is not on disk, the stream is spooled
 * with writeToTempFile() — keeping the lower-cased extension only if it
 * consists of the characters "._a-z0-9" — and that temporary file is removed
 * again before returning.
 *
 * Chunks flagged CHUNK_TEXT are added via idx.addText(). Chunks flagged
 * CHUNK_VALUE are scanned for property id 2 of kind 1 holding a VT_LPWSTR,
 * which is stored as the "title" value. Other flags are reported on stdout.
 *
 * @param idx  supplies the file name and depth, receives the extracted data
 * @param in   stream handed to writeToTempFile() when the file is not on disk
 * @return 0 if the filter was loaded and initialised, -1 otherwise
 */
signed char
IFilterEndAnalyzer::analyze(AnalysisResult& idx, InputStream *in) {
    const string& filename = idx.fileName();
    const string::size_type dot = filename.find_last_of('.');
    if (dot == string::npos
            || extensions.find(filename.substr(dot)) == extensions.end()) {
        return -1;
    }

    string filepath;
    const bool fileisondisk = checkForFile(idx.depth(), filename);
    if (fileisondisk) {
        filepath = filename;
    } else {
        // Keep the extension for the temporary file only when the name does
        // not start with the dot and the extension is made of safe characters.
        string ext;
        if (dot > 0) {
            ext = filename.substr(dot).c_str();
            // ext starts with '.', so it is never empty here. Lower-case the
            // string's own writable storage instead of casting away const.
            strlwr(&ext[0]);
            if (ext.find_first_not_of("._abcdefghijklmnopqrstuvwxyz0123456789") != string::npos) {
                ext.clear();
            }
        }
        filepath = writeToTempFile(in, ext.c_str());
    }

    if (filepath.length() > 0) {
        void* pvfilter = NULL;

        wchar_t tmp[MAX_PATH];
        _cpycharToWide(tmp, filepath.c_str(), MAX_PATH);
        HRESULT hr = LoadIFilter(tmp, NULL, &pvfilter);
        if (hr == S_OK) {
            IFilter* filter = (IFilter*)pvfilter;

            ULONG initFlags = 0;
            hr = filter->Init(IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, NULL, &initFlags);
            if (FAILED(hr)) {
                // Release the filter on this path too. Otherwise the COM
                // object leaks, and while it is alive it may keep the
                // temporary file open so that the unlink below fails.
                filter->Release();
                if (!fileisondisk)
                    unlink(filepath.c_str());
                return -1;
            }

            const int sbBufferLen = 1024;
            wchar_t sbBuffer[sbBufferLen];

            STAT_CHUNK ps;
            hr = filter->GetChunk(&ps);
            while (SUCCEEDED(hr)) {
                if (ps.flags == CHUNK_TEXT) {
                    HRESULT textResult = S_OK;
                    while (SUCCEEDED(textResult)) {
                        ULONG sizeBuffer = sbBufferLen;
                        textResult = filter->GetText(&sizeBuffer, sbBuffer);
                        // Only trust the buffer and the count when the call
                        // succeeded; after a failure they may be stale.
                        if (SUCCEEDED(textResult) && sizeBuffer > 0) {
                            string str = wchartoutf8(sbBuffer, sbBuffer + sizeBuffer);
                            idx.addText(str.c_str(), str.length());
                        }
                    }
                } else if (ps.flags == CHUNK_VALUE) {
                    PROPVARIANT *pVar = NULL;
                    while (SUCCEEDED(hr = filter->GetValue(&pVar))) {
                        //printf("propid: %d\nkind:%d\n",ps.attribute.psProperty.propid,ps.attribute.psProperty.ulKind);
                        if (ps.attribute.psProperty.propid == 2 &&
                            ps.attribute.psProperty.ulKind == 1 &&
                            pVar->vt == VT_LPWSTR &&
                            pVar->pwszVal != NULL) {   // wcslen(NULL) would crash

                            string str = wchartoutf8(pVar->pwszVal, pVar->pwszVal + wcslen(pVar->pwszVal));
                            idx.addValue("title", str);
                        }
                        PropVariantClear(pVar);
                        CoTaskMemFree(pVar);
                    }
                } else {
                    printf("other flag %d\n", ps.flags);
                }
                hr = filter->GetChunk(&ps);
            }

            // Release before unlinking, as the Init failure path does.
            filter->Release();
            if (!fileisondisk)
                unlink(filepath.c_str());
            return 0;
        }

        // The filter could not be loaded: print the system message for the
        // thread's last error code, if there is one.
        const DWORD dw = GetLastError();
        if (dw != 0) {
            // Use the wide-character API explicitly so the buffer type always
            // matches the wide format below, whatever the UNICODE setting.
            LPWSTR lpMsgBuf = NULL;
            const DWORD cch = FormatMessageW(
                FORMAT_MESSAGE_ALLOCATE_BUFFER |
                FORMAT_MESSAGE_FROM_SYSTEM |
                FORMAT_MESSAGE_IGNORE_INSERTS,   // no argument array is supplied
                NULL,
                dw,
                MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                (LPWSTR) &lpMsgBuf,
                0, NULL);

            // On failure nothing was allocated, so there is nothing to print
            // or free.
            if (cch != 0 && lpMsgBuf != NULL) {
                wprintf(L"%ls\n", lpMsgBuf);
                LocalFree(lpMsgBuf);
            }
        }
    }

    if (!fileisondisk && filepath.length() > 0) {
        unlink(filepath.c_str());
    }
    return -1;
}