// @pymethod |PyIFilter|GetText|Description of GetText. PyObject *PyIFilter::GetText(PyObject *self, PyObject *args) { IFilter *pIF = GetI(self); if ( pIF == NULL ) return NULL; // @pyparm <int>|nBufSize|size of text buffer to create ULONG nBufSize = 0; if ( !PyArg_ParseTuple(args, "|i:GetText", &nBufSize) ) return NULL; HRESULT hr; if (nBufSize == 0) nBufSize = 8192; // 8k default WCHAR *wBuffer = (WCHAR *)PyMem_Malloc((nBufSize+1)*sizeof(WCHAR)); if (!wBuffer){ PyErr_SetString(PyExc_MemoryError, "getting text"); return NULL; } PY_INTERFACE_PRECALL; hr = pIF->GetText( &nBufSize, wBuffer ); PY_INTERFACE_POSTCALL; if ( FAILED(hr) ) { PyMem_Free(wBuffer); return PyCom_BuildPyException(hr, pIF, IID_IFilter ); } PyObject *obRet = PyWinObject_FromWCHAR(wBuffer, nBufSize); PyMem_Free(wBuffer); return obRet; }
// Converts cchText wide characters starting at pwszText to UTF-8 and writes
// the resulting bytes to stdout. Returns S_OK on success, E_FAIL when the
// conversion or the write fails (a message is printed to tcerr and
// errorMessagePrinted is set), or E_OUTOFMEMORY when the allocation yields NULL.
static HRESULT WriteBlockAsUtf8(const wchar_t* pwszText, ULONG cchText)
{
    // An explicit length is passed instead of -1, so the text needs no
    // terminator and no trailing NUL byte ends up in the output.
    const int cchWide = (int)cchText;

    // First call: query the number of bytes required.
    unsigned int cbMultiByte = WideCharToMultiByte(CP_UTF8, 0, pwszText, cchWide, NULL, 0, NULL, NULL);
    if (0 == cbMultiByte)
    {
        tcerr << "WideCharToMultiByte#1 invocation failed" << endl;
        errorMessagePrinted = true;
        return E_FAIL;
    }

    char* pchMultiByte = new char[cbMultiByte];
    if (NULL == pchMultiByte)
    {
        return E_OUTOFMEMORY;
    }

    HRESULT hr = S_OK;
    // Second call: perform the actual conversion into the buffer.
    if (0 == WideCharToMultiByte(CP_UTF8, 0, pwszText, cchWide, pchMultiByte, cbMultiByte, NULL, NULL))
    {
        hr = E_FAIL;
        tcerr << "WideCharToMultiByte#2 invocation failed" << endl;
        errorMessagePrinted = true;
    }
    else if (cbMultiByte > fwrite(pchMultiByte, 1, cbMultiByte, stdout))
    {
        // A short write means not every converted byte reached stdout.
        hr = E_FAIL;
        tcerr << "Unable to write converted bytes to output" << endl;
        errorMessagePrinted = true;
    }

    delete[] pchMultiByte;
    return hr;
}

// Loads the IFilter for the file at szPath, walks every chunk it reports and
// writes the text of each chunk to stdout as UTF-8.
//
// Returns S_OK once FILTER_E_END_OF_CHUNKS is reached without error; otherwise
// the failing HRESULT. Whenever a diagnostic is printed to tcerr,
// errorMessagePrinted is set to true.
HRESULT Analyze(wchar_t* szPath)
{
    // Load the IFilter associated with the specified file
    IFilter* pFilter = NULL;
    HRESULT hr = LoadIFilter(szPath, NULL, (void**)&pFilter);
    if (FAILED(hr))
    {
        tcerr << "IFilter loading failed with HRESULT " << hr << endl;
        errorMessagePrinted = true;
        return hr;
    }

    // Initialize the IFilter
    DWORD dwFlags = 0;
    hr = pFilter->Init(FILTER_INIT_OPTIONS, 0, NULL, &dwFlags);
    if (FAILED(hr))
    {
        tcerr << "IFilter initialization failed with HRESULT " << hr << endl;
        errorMessagePrinted = true;
        pFilter->Release();
        return hr;
    }

    wchar_t szBuffer[BUFLEN];
    STAT_CHUNK ps;
    while (SUCCEEDED(hr))
    {
        // Retrieve the next chunk in the document
        hr = pFilter->GetChunk(&ps);
        if ((FILTER_E_EMBEDDING_UNAVAILABLE == hr) || (FILTER_E_LINK_UNAVAILABLE == hr))
        {
            // This chunk cannot be filtered; move on to the next one.
            hr = S_OK;
            continue;
        }
        if (FILTER_E_END_OF_CHUNKS == hr)
        {
            hr = S_OK;
            break;
        }

        while (SUCCEEDED(hr))
        {
            // Retrieve the next block of text in the current chunk.
            // ulSize is in/out: capacity going in, characters written coming out.
            ULONG ulSize = BUFLEN;
            hr = pFilter->GetText(&ulSize, szBuffer);
            if ((FILTER_E_NO_TEXT == hr) || (FILTER_E_NO_MORE_TEXT == hr))
            {
                hr = S_OK;
                break;
            }

            if (SUCCEEDED(hr) && (0 < ulSize))
            {
                // Never trust a reported length beyond the buffer we own.
                if (ulSize > BUFLEN)
                {
                    ulSize = BUFLEN;
                }

                // Keep GetText's success code unless the write itself fails.
                const HRESULT hrWrite = WriteBlockAsUtf8(szBuffer, ulSize);
                if (FAILED(hrWrite))
                {
                    hr = hrWrite;
                }
            }
        }
    }

    pFilter->Release();
    return hr;
}
signed char IFilterEndAnalyzer::analyze(AnalysisResult& idx, InputStream *in) { const string& filename = idx.fileName(); int p = filename.find_last_of('.'); if (p < 0 || extensions.find(filename.substr(p)) == extensions.end()) { return -1; } string filepath; bool fileisondisk = checkForFile(idx.depth(), filename); if (fileisondisk) { filepath = filename; } else { int p = filename.find_last_of("."); if ( p > 0 ){ string ext = filename.substr(p).c_str(); strlwr((char*)ext.c_str()); p = ext.find_first_not_of("._abcdefghijklmnopqrstuvwxyz0123456789"); if ( p >= 0 ) filepath = writeToTempFile(in, ""); else filepath = writeToTempFile(in, ext.c_str()); }else filepath = writeToTempFile(in, ""); } if (filepath.length() > 0) { IFilter* filter = NULL; void* pvfilter=NULL; wchar_t tmp[MAX_PATH]; _cpycharToWide(tmp,filepath.c_str(),MAX_PATH); HRESULT hr = LoadIFilter(tmp,NULL,&pvfilter); if (hr == S_OK) { filter = (IFilter*)pvfilter; ULONG __i=0; hr = filter->Init(IFILTER_INIT_APPLY_INDEX_ATTRIBUTES,0,NULL,&__i); if (FAILED( hr )) { if (!fileisondisk) unlink(filepath.c_str()); return -1; } const int sbBufferLen = 1024; wchar_t sbBuffer[sbBufferLen]; STAT_CHUNK ps; hr = filter->GetChunk(&ps); while ( SUCCEEDED(hr) ) { if (ps.flags == CHUNK_TEXT) { int resultText = 0; while ( resultText >= 0 ) { ULONG sizeBuffer=sbBufferLen; resultText = filter->GetText(&sizeBuffer, sbBuffer); if (sizeBuffer > 0 ) { string str = wchartoutf8(sbBuffer,sbBuffer+sizeBuffer); idx.addText(str.c_str(),str.length()); } } } else if ( ps.flags == CHUNK_VALUE ) { PROPVARIANT *pVar; while ( SUCCEEDED( hr = filter->GetValue( &pVar ) ) ) { //printf("propid: %d\nkind:%d\n",ps.attribute.psProperty.propid,ps.attribute.psProperty.ulKind); if ( ps.attribute.psProperty.propid == 2 && ps.attribute.psProperty.ulKind == 1 && pVar->vt == VT_LPWSTR ) { string str = wchartoutf8(pVar->pwszVal,pVar->pwszVal+wcslen(pVar->pwszVal)); idx.addValue("title", str ); } PropVariantClear( pVar ); CoTaskMemFree( pVar ); } } else { 
printf("other flag %d\n",ps.flags); } hr = filter->GetChunk(&ps); } filter->Release(); if (!fileisondisk) unlink(filepath.c_str()); return 0; } DWORD dw = GetLastError(); if ( dw != 0 ) { LPVOID lpMsgBuf; FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM, NULL, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR) &lpMsgBuf, 0, NULL ); wprintf(L"%s\n", lpMsgBuf); LocalFree(lpMsgBuf); } } if (!fileisondisk && filepath.length()>0) { unlink(filepath.c_str()); } return -1; }