/// Try to apply this affix rule to sWord, modifying it in place.
/// Returns true when the rule has no condition at all (sWord is then left untouched),
/// or when the condition check passed and the strip/append step succeeded.
/// Returns false for an empty word, a failed condition check, or a failed strip/append.
/// Side effect: stores the byte length of sWord in m_iWordLen for the helper methods.
bool CISpellAffixRule::Apply ( CSphString & sWord )
{
	// a rule without a condition is reported as applied without touching the word
	if ( m_sCondition.IsEmpty () )
		return true;

	// nothing to match against
	if ( sWord.IsEmpty () )
		return false;

	m_iWordLen = strlen ( sWord.cstr () );

	// a lone "." condition bypasses the condition check entirely
	const bool bMustCheck = !( m_sCondition=="." );

	if ( m_eRule==RULE_SUFFIXES )
	{
		if ( bMustCheck && !CheckSuffix ( sWord ) )
			return false;

		return StripAppendSuffix ( sWord );
	}

	// every other rule kind is handled as a prefix rule
	if ( bMustCheck && !CheckPrefix ( sWord ) )
		return false;

	return StripAppendPrefix ( sWord );
}
void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements ) { CSphString sError; CSphHTMLStripper tStripper ( true ); if ( !tStripper.SetIndexedAttrs ( sIndexAttrs, sError ) || !tStripper.SetRemovedElements ( sRemoveElements, sError ) ) sphDie ( "failed to configure stripper: %s", sError.cstr() ); CSphVector<BYTE> dBuffer; while ( !feof(stdin) ) { char sBuffer[1024]; int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin ); if ( !iLen ) break; int iPos = dBuffer.GetLength(); dBuffer.Resize ( iPos+iLen ); memcpy ( &dBuffer[iPos], sBuffer, iLen ); } dBuffer.Add ( 0 ); tStripper.Strip ( &dBuffer[0] ); fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] ); }
void ExcerptGen_c::AddJunk ( int iStart, int iLength ) { int iChunkStart = iStart; for ( int i = iStart; i < iStart+iLength; i++ ) if ( sphIsSpace ( m_sBuffer.cstr () [i] ) != sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) ) { m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = i - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; iChunkStart = i; } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = iStart + iLength - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; }
/// Prepare a MySQL source for iteration: apply tParams, connect, then start iterating.
/// Each failing step is reported through sphDie().
void SetupIndexing ( CSphSource_MySQL * pSrc, const CSphSourceParams_MySQL & tParams )
{
	CSphString sError;

	const bool bSetupOk = pSrc->Setup ( tParams );
	if ( !bSetupOk )
		sphDie ( "setup failed" );

	const bool bConnected = pSrc->Connect ( sError );
	if ( !bConnected )
		sphDie ( "connect failed: %s", sError.cstr() );

	const bool bStarted = pSrc->IterateStart ( sError );
	if ( !bStarted )
		sphDie ( "iterate-start failed: %s", sError.cstr() );
}
bool sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings & tSettings, CSphString & sError ) { // charset_type CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL ); if ( !hIndex("charset_type") || hIndex["charset_type"]=="sbcs" ) { tSettings.m_iType = TOKENIZER_SBCS; } else if ( hIndex["charset_type"]=="utf-8" ) { tSettings.m_iType = hIndex("ngram_chars") ? TOKENIZER_NGRAM : TOKENIZER_UTF8; } else { sError.SetSprintf ( "unknown charset type '%s'", hIndex["charset_type"].cstr() ); return false; } tSettings.m_sCaseFolding = hIndex.GetStr ( "charset_table" ); tSettings.m_iMinWordLen = Max ( hIndex.GetInt ( "min_word_len" ), 0 ); tSettings.m_sNgramChars = hIndex.GetStr ( "ngram_chars" ); tSettings.m_iNgramLen = Max ( hIndex.GetInt ( "ngram_len" ), 0 ); tSettings.m_sSynonymsFile = hIndex.GetStr ( "exceptions" ); // new option name if ( tSettings.m_sSynonymsFile.IsEmpty() ) tSettings.m_sSynonymsFile = hIndex.GetStr ( "synonyms" ); // deprecated option name tSettings.m_sIgnoreChars = hIndex.GetStr ( "ignore_chars" ); // phrase boundaries int iBoundaryStep = Max ( hIndex.GetInt ( "phrase_boundary_step" ), -1 ); if ( iBoundaryStep!=0 ) tSettings.m_sBoundary = hIndex.GetStr ( "phrase_boundary" ); return true; }
/// Unregister the plugin identified by (eType, sName) from the global plugin hash.
/// When this was the last hashed plugin of its library, the library is removed from
/// the library hash and released as well.
/// Returns false and sets sError when no such plugin is registered, or when the
/// build has no dlopen() support.
bool sphPluginDrop ( PluginType_e eType, const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	PluginKey_t tKey ( eType, sName );
	PluginDesc_c ** ppFound = g_hPlugins(tKey);
	PluginDesc_c * pDesc = ppFound ? *ppFound : NULL;
	if ( !pDesc )
	{
		sError.SetSprintf ( "plugin '%s' does not exist", sName );
		return false;
	}

	// fetch the library pointer before the plugin itself is released
	PluginLib_c * pLib = pDesc->GetLib();

	Verify ( g_hPlugins.Delete(tKey) );
	pDesc->Release();

	// one hashed plugin fewer; drop the library once none are left
	--pLib->m_iHashedPlugins;
	if ( pLib->m_iHashedPlugins==0 )
	{
		g_hPluginLibs.Delete ( pLib->GetName() );
		pLib->Release();
	}

	return true;
#endif // HAVE_DLOPEN
}
/// Split a "plugins.so:myfilter:options" style spec string on ':' into dParams.
/// An input that yields no parts is accepted as is; two parts get an empty third
/// (options) entry appended; three parts are accepted unchanged.
/// One part, or more than three, is rejected with a message in sError.
bool sphPluginParseSpec ( const CSphString & sParams, CSphVector<CSphString> & dParams, CSphString & sError )
{
	dParams.Resize ( 0 );
	sphSplit ( dParams, sParams.cstr(), ":" );

	const int iParts = dParams.GetLength();

	// nothing at all, or a fully specified spec
	if ( iParts==0 || iParts==3 )
		return true;

	if ( iParts==1 )
	{
		sError = "filter name required in spec string; example: \"plugins.so:myfilter\"";
		return false;
	}

	if ( iParts==2 )
	{
		// options were omitted; normalize to three entries
		dParams.Add ( "" );
		return true;
	}

	sError = "too many parts in spec string; must be in \"plugins.so:myfilter:options\" format";
	return false;
}
/// Match the rule condition against the end of sWord, walking both right to left.
/// Condition syntax as handled here: '.' accepts any character, "[...]" is checked
/// via IsInSet() against the characters between the brackets, and anything else
/// must match literally.
/// Uses m_iWordLen as the word length and m_iCondLen as the condition length.
/// Returns true when no mismatch was found before either the word or the condition
/// ran out (so a word shorter than the condition can still pass).
bool CISpellAffixRule::CheckSuffix ( const CSphString & sWord ) const
{
	const char * sCond = m_sCondition.cstr();
	const char * sText = sWord.cstr();

	int iCond = m_iCondLen-1;
	int iChar = m_iWordLen-1;

	while ( iCond>=0 && iChar>=0 )
	{
		const char cCond = sCond[iCond];
		const char cWord = sText[iChar];
		--iChar; // every pass consumes exactly one word character

		// wildcard
		if ( cCond=='.' )
		{
			--iCond;
			continue;
		}

		// literal character
		if ( cCond!=']' )
		{
			if ( cCond!=cWord )
				return false;

			--iCond;
			continue;
		}

		// character set; scan back for the matching '['
		int iOpen = iCond;
		while ( iOpen>=0 && sCond[iOpen]!='[' )
			--iOpen;

		if ( iOpen<0 )
			return false; // unbalanced ']'

		if ( !IsInSet ( cWord, m_sCondition.SubString ( iOpen + 1, iCond - iOpen - 1 ).cstr () ) )
			return false;

		iCond = iOpen - 1;
	}

	return true;
}
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary ) { int iChunkStart = iStart; int iSaved = 0; for ( int i = iStart; i < iStart+iLength; i++ ){ const char* buf_ptr = NULL; if(m_bUtf8){ buf_ptr = m_sBufferUTF8.cstr (); }else{ buf_ptr = m_sBuffer.cstr (); } if ( sphIsSpace ( buf_ptr[i] ) != sphIsSpace ( buf_ptr[iChunkStart] ) ) { m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = i - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; iChunkStart = i; iSaved += tLast.m_iLengthBytes; if ( iBoundary != -1 && iSaved > iBoundary - iStart ) { AddBoundary(); iBoundary = -1; } } } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = iStart + iLength - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; if ( iBoundary != -1 ) AddBoundary(); }
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary ) { assert ( iLength>0 ); assert ( iLength<=m_sBuffer.Length() ); assert ( iStart+iLength<=m_sBuffer.Length() ); int iChunkStart = iStart; int iSaved = 0; for ( int i = iStart; i < iStart+iLength; i++ ) if ( sphIsSpace ( m_sBuffer.cstr () [i] )!=sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) ) { Token_t & tLast = m_dTokens.Add(); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = i - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; tLast.m_uPosition = 0; iChunkStart = i; iSaved += tLast.m_iLengthBytes; if ( iBoundary!=-1 && iSaved > ( iBoundary-iStart ) ) { AddBoundary(); iBoundary = -1; } } Token_t & tLast = m_dTokens.Add(); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = iStart + iLength - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; tLast.m_uPosition = 0; if ( iBoundary!=-1 ) AddBoundary(); }
/// Match the rule condition against the beginning of sWord, walking both left to right.
/// Condition syntax as handled here: '.' accepts any character (same as CheckSuffix),
/// "[...]" is checked via IsInSet() against the characters between the brackets, and
/// anything else must match literally.
/// Uses m_iWordLen as the word length and m_iCondLen as the condition length.
/// Returns true when no mismatch was found before either the word or the condition
/// ran out; returns false on a mismatch or on a '[' without a closing ']'.
bool CISpellAffixRule::CheckPrefix ( const CSphString & sWord ) const
{
	const char * sCond = m_sCondition.cstr();
	const char * sText = sWord.cstr();

	int iCondI = 0;
	for ( int i = 0; iCondI < m_iCondLen && i < m_iWordLen; ++i )
	{
		const char cCond = sCond[iCondI];

		if ( cCond=='.' )
		{
			// wildcard, consumes one word character
			++iCondI;

		} else if ( cCond!='[' )
		{
			// literal character
			if ( cCond!=sText[i] )
				return false;

			++iCondI;

		} else
		{
			// character set; look for the closing bracket
			int iRangeEnd = -1;
			for ( int j=iCondI; j<m_iCondLen && iRangeEnd==-1; ++j )
				if ( sCond[j]==']' )
					iRangeEnd = j;

			if ( iRangeEnd==-1 )
				return false;

			if ( !IsInSet ( sText[i], m_sCondition.SubString ( iCondI + 1, iRangeEnd - iCondI - 1 ).cstr () ) )
				return false;

			// step past the ']' so that the next word character is matched against
			// the next condition element instead of this very set again
			iCondI = iRangeEnd + 1;
		}
	}

	return true;
}
static bool PluginLoadSymbols ( void * pDesc, const SymbolDesc_t * pSymbol, void * pHandle, const char * sName, CSphString & sError ) { #if !HAVE_DLOPEN sError = "no dlopen(), no plugins"; return false; #else CSphString s; while ( pSymbol->m_iOffsetOf>=0 ) { s.SetSprintf ( pSymbol->m_sPostfix[0] ? "%s_%s" : "%s%s", sName, pSymbol->m_sPostfix ); void ** ppFunc = (void**)((BYTE*)pDesc + pSymbol->m_iOffsetOf); *ppFunc = dlsym ( pHandle, s.cstr() ); if ( !*ppFunc && pSymbol->m_bRequired ) { sError.SetSprintf ( "symbol %s() not found", s.cstr() ); return false; } pSymbol++; } return true; #endif // HAVE_DLOPEN }
/// Apply the prefix rule to sWord in place: remove the leading m_sStrip (when set)
/// and put m_sAppend (when set) in front of what remains.
/// Relies on m_iWordLen holding the length of sWord (set by Apply()).
/// Returns false, leaving sWord untouched, when sWord does not start with m_sStrip
/// or when the result would not fit into the internal MAX_STR_LENGTH buffer.
/// Uses a static scratch buffer, so the function is not reentrant.
bool CISpellAffixRule::StripAppendPrefix ( CSphString & sWord ) const
{
	static char szTmp [MAX_STR_LENGTH];
	const int iRoom = (int) sizeof ( szTmp );

	// the word must begin with the strip string
	if ( !m_sStrip.IsEmpty () )
	{
		if ( strncmp ( sWord.cstr (), m_sStrip.cstr (), strlen ( m_sStrip.cstr () ) )!=0 )
			return false;
	}

	// refuse anything that would overrun szTmp; we need room for the append string,
	// the kept tail of the word, and the trailing zero
	const int iKeep = m_iWordLen - m_iStripLen;
	const int iAppendBytes = m_sAppend.IsEmpty () ? 0 : (int) strlen ( m_sAppend.cstr () );
	if ( iKeep<0 || m_iAppendLen<0 || iAppendBytes>=iRoom || iKeep + m_iAppendLen>=iRoom )
		return false;

	if ( !m_sAppend.IsEmpty () )
		strcpy ( szTmp, m_sAppend.cstr() ); // NOLINT

	// NOTE(review): the tail is placed at m_iAppendLen, which is presumed to equal
	// the length of m_sAppend (0 when it is empty) - confirm where the rule is loaded
	strncpy ( szTmp + m_iAppendLen, sWord.cstr () + m_iStripLen, iKeep );
	szTmp [iKeep + m_iAppendLen] = '\0';

	sWord = szTmp;
	return true;
}
/// Apply the suffix rule to sWord in place: remove the trailing m_sStrip (when set)
/// and append m_sAppend (when set) to what remains.
/// Relies on m_iWordLen holding the length of sWord (set by Apply()).
/// Returns false, leaving sWord untouched, when sWord does not end with m_sStrip
/// or when the result would not fit into the internal MAX_STR_LENGTH buffer.
/// Uses a static scratch buffer, so the function is not reentrant.
bool CISpellAffixRule::StripAppendSuffix ( CSphString & sWord ) const
{
	static char szTmp [MAX_STR_LENGTH];
	const int iRoom = (int) sizeof ( szTmp );

	// the word must end with the strip string
	if ( !m_sStrip.IsEmpty () )
	{
		if ( m_iWordLen < m_iStripLen )
			return false;

		if ( strncmp ( sWord.cstr () + m_iWordLen - m_iStripLen, m_sStrip.cstr (), m_iStripLen ) )
			return false;
	}

	// refuse anything that would overrun szTmp; we need room for the kept stem,
	// the append string, and the trailing zero
	const int iStem = m_iWordLen - m_iStripLen;
	const int iAppendBytes = m_sAppend.IsEmpty () ? 0 : (int) strlen ( m_sAppend.cstr () );
	if ( iStem<0 || iStem + iAppendBytes>=iRoom )
		return false;

	strncpy ( szTmp, sWord.cstr (), iStem );
	szTmp [iStem] = '\0';

	if ( !m_sAppend.IsEmpty () )
		strcat ( szTmp, m_sAppend.cstr () ); // NOLINT

	sWord = szTmp;
	return true;
}
/// Load the library sLibName from the plugin directory and verify its UDF version.
/// sLibName is a file name relative to g_sPluginDir.
/// With bLinuxReload set, the file is temporarily renamed before dlopen() and renamed
/// back afterwards, so that dlopen() does not hand out its cached copy of the old file.
/// Returns a new PluginLib_c wrapping the handle, or NULL with sError set when the
/// rename, dlopen(), the "<basename>_ver" symbol lookup, or the version check fails.
/// The original file name is restored and the handle is closed on every failure path.
static PluginLib_c * LoadPluginLibrary ( const char * sLibName, CSphString & sError, bool bLinuxReload=false )
{
	CSphString sTmpfile;
	CSphString sLibfile;
	sLibfile.SetSprintf ( "%s/%s", g_sPluginDir.cstr(), sLibName );

	// dlopen caches the old file content, even if file was updated
	// let's reload library from the temporary file to invalidate the cache
	if ( bLinuxReload )
	{
		sTmpfile.SetSprintf ( "%s/%s.%u", g_sPluginDir.cstr(), sLibName, sphRand() );
		if ( ::rename ( sLibfile.cstr(), sTmpfile.cstr() ) )
		{
			sError.SetSprintf ( "failed to rename file (src=%s, dst=%s, errno=%d, error=%s)", sLibfile.cstr(), sTmpfile.cstr(), errno, strerror(errno) );
			return NULL;
		}
	}

	const char * sOpenPath = bLinuxReload ? sTmpfile.cstr() : sLibfile.cstr();
	void * pHandle = dlopen ( sOpenPath, RTLD_LAZY | RTLD_LOCAL );
	if ( !pHandle )
	{
		// format the dlopen() failure first; dlerror() text is only valid until the next dl call
		const char * sDlerror = dlerror();
		CSphString sCause;
		sCause.SetSprintf ( "dlopen() failed: %s", sDlerror ? sDlerror : "(null)" );

		// do not leave the library stranded under its temporary name
		if ( bLinuxReload && ::rename ( sTmpfile.cstr(), sLibfile.cstr() ) )
			sError.SetSprintf ( "%s; failed to rename file (src=%s, dst=%s, errno=%d, error=%s)", sCause.cstr(), sTmpfile.cstr(), sLibfile.cstr(), errno, strerror(errno) );
		else
			sError = sCause;

		return NULL;
	}
	sphLogDebug ( "dlopen(%s)=%p", sOpenPath, pHandle );

	// rename file back to the original name
	if ( bLinuxReload )
	{
		if ( ::rename ( sTmpfile.cstr(), sLibfile.cstr() ) )
		{
			sError.SetSprintf ( "failed to rename file (src=%s, dst=%s, errno=%d, error=%s)", sTmpfile.cstr(), sLibfile.cstr(), errno, strerror(errno) );
			dlclose ( pHandle ); // the handle would leak otherwise
			return NULL;
		}
	}

	// the version symbol is named after the library name up to its first dot
	CSphString sBasename = sLibName;
	const char * pDot = strchr ( sBasename.cstr(), '.' );
	if ( pDot )
		sBasename = sBasename.SubString ( 0, pDot-sBasename.cstr() );

	CSphString sTmp;
	PluginVer_fn fnVer = (PluginVer_fn) dlsym ( pHandle, sTmp.SetSprintf ( "%s_ver", sBasename.cstr() ).cstr() );
	if ( !fnVer )
	{
		sError.SetSprintf ( "symbol '%s_ver' not found in '%s': update your UDF implementation", sBasename.cstr(), sLibName );
		dlclose ( pHandle );
		return NULL;
	}

	if ( fnVer() < SPH_UDF_VERSION )
	{
		sError.SetSprintf ( "library '%s' was compiled using an older version of sphinxudf.h; it needs to be recompiled", sLibName );
		dlclose ( pHandle );
		return NULL;
	}

	return new PluginLib_c ( pHandle, sLibName );
}
bool sphPluginReload ( const char * sName, CSphString & sError ) { #if !HAVE_DLOPEN sError = "no dlopen(), no plugins"; return false; #else // find all plugins from the given library CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex ); CSphVector<PluginKey_t> dKeys; CSphVector<PluginDesc_c*> dPlugins; g_hPlugins.IterateStart(); while ( g_hPlugins.IterateNext() ) { PluginDesc_c * v = g_hPlugins.IterateGet(); if ( v->GetLibName()==sName ) { dKeys.Add ( g_hPlugins.IterateGetKey() ); dPlugins.Add ( g_hPlugins.IterateGet() ); } } // no plugins loaded? oops if ( dPlugins.GetLength()==0 ) { sError.SetSprintf ( "no active plugins loaded from %s", sName ); return false; } // load new library and check every plugin #if !USE_WINDOWS PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError, true ); #else PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError ); #endif if ( !pNewLib ) return false; // load all plugins CSphVector<PluginDesc_c*> dNewPlugins; ARRAY_FOREACH ( i, dPlugins ) { PluginDesc_c * pDesc = NULL; const SymbolDesc_t * pSym = NULL; switch ( dKeys[i].m_eType ) { case PLUGIN_RANKER: pDesc = new PluginRanker_c ( pNewLib ); pSym = g_dSymbolsRanker; break; case PLUGIN_INDEX_TOKEN_FILTER: pDesc = new PluginTokenFilter_c ( pNewLib ); pSym = g_dSymbolsTokenFilter; break; case PLUGIN_QUERY_TOKEN_FILTER: pDesc = new PluginQueryTokenFilter_c ( pNewLib ); pSym = g_dSymbolsQueryTokenFilter; break; case PLUGIN_FUNCTION: pDesc = new PluginUDF_c ( pNewLib, dPlugins[i]->GetUdfRetType() ); pSym = g_dSymbolsUDF; break; default: sphDie ( "INTERNAL ERROR: unknown plugin type %d in sphPluginReload()", (int)dKeys[i].m_eType ); return false; } if ( !PluginLoadSymbols ( pDesc, pSym, pNewLib->GetHandle(), dKeys[i].m_sName.cstr(), sError ) ) { pDesc->Release(); break; } dNewPlugins.Add ( pDesc ); }
/// Pull every document from pSrc into pIndex, committing every COMMIT_STEP documents
/// and once more when the source is exhausted (signalled by a zero document id).
/// Prints a progress line every 100 documents and, once per process, runs DoSearch()
/// after enough commits to cover 5000 documents. Finally disconnects the source,
/// prints throughput and commit timing, and adds the indexed megabytes to g_fTotalMB.
/// Source iteration failures are reported through sphDie().
void DoIndexing ( CSphSource * pSrc, ISphRtIndex * pIndex )
{
	// function-level static; the mid-run search happens only once per process
	static bool bSearchPending = true;

	CSphString sError;
	CSphVector<DWORD> dMvas;

	int64_t tmStart = sphMicroTimer ();
	int64_t tmAvgCommit = 0; // holds the running sum until the final division
	int64_t tmMaxCommit = 0;
	int iCommits = 0;

	for ( ;; )
	{
		if ( !pSrc->IterateDocument ( sError ) )
			sphDie ( "iterate-document failed: %s", sError.cstr() );

		ISphHits * pHitsNext = pSrc->IterateHits ( sError );
		if ( !sError.IsEmpty() )
			sphDie ( "iterate-hits failed: %s", sError.cstr() );

		if ( pSrc->m_tDocInfo.m_iDocID )
			pIndex->AddDocument ( pHitsNext, pSrc->m_tDocInfo, NULL, dMvas, sError );

		// a zero document id marks the end of the source
		const bool bSourceDone = !pSrc->m_tDocInfo.m_iDocID;

		if ( bSourceDone || ( pSrc->GetStats().m_iTotalDocuments % COMMIT_STEP )==0 )
		{
			int64_t tmCommit = sphMicroTimer();
			pIndex->Commit ();
			tmCommit = sphMicroTimer()-tmCommit;

			iCommits++;
			tmAvgCommit += tmCommit;
			if ( tmCommit>tmMaxCommit )
				tmMaxCommit = tmCommit;

			if ( bSourceDone )
			{
				tmAvgCommit /= iCommits;
				break;
			}
		}

		if ( ( pSrc->GetStats().m_iTotalDocuments % 100 )==0 )
			printf ( "%d docs\r", (int)pSrc->GetStats().m_iTotalDocuments );

		if ( bSearchPending && iCommits*COMMIT_STEP>=5000 )
		{
			printf ( "\n" );
			DoSearch ( pIndex );
			bSearchPending = false;
		}
	}

	pSrc->Disconnect();

	int64_t tmEnd = sphMicroTimer ();
	const int64_t tmElapsed = tmEnd-tmStart;
	float fTotalMB = (float)pSrc->GetStats().m_iTotalBytes/1000000.0f;

	printf ( "commit-step %d, %d docs, %d bytes, %d.%03d sec, %.2f MB/sec\n",
		COMMIT_STEP,
		(int)pSrc->GetStats().m_iTotalDocuments,
		(int)pSrc->GetStats().m_iTotalBytes,
		(int)(tmElapsed/1000000), (int)((tmElapsed%1000000)/1000),
		fTotalMB*1000000.0f/tmElapsed );
	printf ( "commit-docs %d, avg %d.%03d msec, max %d.%03d msec\n",
		COMMIT_STEP,
		(int)(tmAvgCommit/1000), (int)(tmAvgCommit%1000),
		(int)(tmMaxCommit/1000), (int)(tmMaxCommit%1000) );

	g_fTotalMB += fTotalMB;
}
/// RT index test driver.
/// Creates two sources over the even and odd ids of the `posting` table, builds an RT
/// index named "testrt" using the schema of the first source, starts two IndexingThread
/// threads (one per source) and waits for both, runs DoSearch(), then deletes the index
/// and prints how long the shutdown and the whole run took.
int main ()
{
	// threads should be initialized before memory allocations
	char cTopOfMainStack;
	sphThreadInit();
	MemorizeStack ( &cTopOfMainStack );

	CSphString sError;
	CSphDictSettings tDictSettings;

	// first source: even ids
	ISphTokenizer * pTok = sphCreateUTF8Tokenizer();
	CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt1" );
	CSphSource * pSrc = SpawnSource ( "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM posting WHERE id<=10000 AND id%2=0", pTok, pDict );

	// second source: odd ids
	// NOTE(review): pDict2 is created with pTok, not pTok2, while the source below gets pTok2;
	// this looks like a copy-paste slip - confirm whether pTok2 was intended
	ISphTokenizer * pTok2 = sphCreateUTF8Tokenizer();
	CSphDict * pDict2 = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt2" );
	CSphSource * pSrc2 = SpawnSource ( "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM posting WHERE id<=10000 AND id%2=1", pTok2, pDict2 );

	// only the first source is asked for its schema
	CSphSchema tSrcSchema;
	if ( !pSrc->UpdateSchema ( &tSrcSchema, sError ) )
		sphDie ( "update-schema failed: %s", sError.cstr() );

	CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static
	tSchema.m_dFields = tSrcSchema.m_dFields;
	for ( int i=0; i<tSrcSchema.GetAttrsCount(); i++ )
		tSchema.AddAttr ( tSrcSchema.GetAttr(i), false );

	// RT subsystem setup with an empty config section, then binlog replay
	CSphConfigSection tRTConfig;
	sphRTInit();
	sphRTConfigure ( tRTConfig, true );
	SmallStringHash_T< CSphIndex * > dTemp;
	sphReplayBinlog ( dTemp, 0 );
	ISphRtIndex * pIndex = sphCreateIndexRT ( tSchema, "testrt", 32*1024*1024, "data/dump", false );
	pIndex->SetTokenizer ( pTok ); // index will own this pair from now on
	pIndex->SetDictionary ( pDict );
	if ( !pIndex->Prealloc ( false, false, sError ) )
		sphDie ( "prealloc failed: %s", pIndex->GetLastError().cstr() );
	// published through a global; presumably read by IndexingThread - verify
	g_pIndex = pIndex;

	// initial indexing
	int64_t tmStart = sphMicroTimer();

	SphThread_t t1, t2;
	sphThreadCreate ( &t1, IndexingThread, pSrc );
	sphThreadCreate ( &t2, IndexingThread, pSrc2 );
	sphThreadJoin ( &t1 );
	sphThreadJoin ( &t2 );

	// disabled code; tParams is not declared anywhere in this function
#if 0
	// update
	tParams.m_sQuery = "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM rt2 WHERE id<=10000";
	SetupIndexing ( pSrc, tParams );
	DoIndexing ( pSrc, pIndex );
#endif

	// search
	DoSearch ( pIndex );

	// shutdown index (should cause dump)
	int64_t tmShutdown = sphMicroTimer();

#if SPH_ALLOCS_PROFILER
	printf ( "pre-shutdown allocs=%d, bytes="INT64_FMT"\n", sphAllocsCount(), sphAllocBytes() );
#endif
	// NOTE(review): g_pIndex still holds this pointer after the delete
	SafeDelete ( pIndex );
#if SPH_ALLOCS_PROFILER
	printf ( "post-shutdown allocs=%d, bytes="INT64_FMT"\n", sphAllocsCount(), sphAllocBytes() );
#endif

	// timers are in microseconds; printed as seconds with millisecond precision
	int64_t tmEnd = sphMicroTimer();
	printf ( "shutdown done in %d.%03d sec\n", (int)((tmEnd-tmShutdown)/1000000), (int)(((tmEnd-tmShutdown)%1000000)/1000) );
	printf ( "total with shutdown %d.%03d sec, %.2f MB/sec\n", (int)((tmEnd-tmStart)/1000000), (int)(((tmEnd-tmStart)%1000000)/1000), g_fTotalMB*1000000.0f/(tmEnd-tmStart) );

#if SPH_DEBUG_LEAKS || SPH_ALLOCS_PROFILER
	sphAllocsStats();
#endif
#if USE_WINDOWS
	// report peak memory usage of this process
	PROCESS_MEMORY_COUNTERS pmc;
	HANDLE hProcess = OpenProcess ( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, GetCurrentProcessId() );
	if ( hProcess && GetProcessMemoryInfo ( hProcess, &pmc, sizeof(pmc)) )
	{
		printf ( "--- peak-wss=%d, peak-pagefile=%d\n", (int)pmc.PeakWorkingSetSize, (int)pmc.PeakPagefileUsage );
	}
#endif

	// NOTE(review): pIndex was already passed to SafeDelete above; this second call
	// is presumably a no-op (assuming SafeDelete resets the pointer) - confirm
	SafeDelete ( pIndex );
	sphRTDone ();
}
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer ) { m_dTokens.Reserve ( 1024 ); m_sBuffer = q.m_sSource; const bool bUtf8 = pTokenizer->IsUtf8(); m_bUtf8 = bUtf8; // tokenize query words int iWordsLength = strlen ( q.m_sWords.cstr() ); CSphVector<char> dKwBuffer ( iWordsLength ); CSphVector<Keyword_t> dKeywords; dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS ); BYTE * sWord; int iKwIndex = 0; pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength ); while ( ( sWord = pTokenizer->GetToken() ) != NULL ) { SphWordID_t iWord = pDict->GetWordID ( sWord ); if ( iWord ) { m_dWords.Resize ( m_dWords.GetLength () + 1 ); Token_t & tLast = m_dWords.Last (); tLast.m_eType = TOK_WORD; tLast.m_iWordID = iWord; tLast.m_iLengthBytes = strlen ( (const char *)sWord ); tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes; // store keyword dKeywords.Resize( dKeywords.GetLength() + 1 ); Keyword_t & kwLast = dKeywords.Last (); // find stars bool bStarBack = *pTokenizer->GetTokenEnd() == '*'; bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) && pTokenizer->GetTokenStart()[-1] == '*'; kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? 
STAR_BACK : 0 ); // store token const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1; dKwBuffer.Resize ( iEndIndex ); kwLast.m_iWord = iKwIndex; strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord ); iKwIndex = iEndIndex; if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS ) break; } } // tokenize document pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) ); const char * pStartPtr = pTokenizer->GetBufferPtr (); const char * pLastTokenEnd = pStartPtr; //assign utf-8 m_sBufferUTF8 = pStartPtr; while ( ( sWord = pTokenizer->GetToken() ) != NULL ) { const char * pTokenStart = pTokenizer->GetTokenStart (); if ( pTokenStart != pStartPtr ) AddJunk ( pLastTokenEnd - pStartPtr, pTokenStart - pLastTokenEnd, pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 ); SphWordID_t iWord = pDict->GetWordID ( sWord ); pLastTokenEnd = pTokenizer->GetTokenEnd (); m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = iWord ? 
TOK_WORD : TOK_SPACE; tLast.m_iStart = pTokenStart - pStartPtr; tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart; tLast.m_iWordID = iWord; tLast.m_uWords = 0; // fill word mask if ( iWord ) { bool bMatch = false; int iOffset; ARRAY_FOREACH ( nWord, m_dWords ) { const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ]; const Token_t & token = m_dWords[nWord]; switch ( dKeywords[nWord].m_uStar ) { case STAR_NONE: bMatch = iWord == token.m_iWordID; break; case STAR_FRONT: iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes; bMatch = (iOffset >= 0) && ( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 ); break; case STAR_BACK: bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) && ( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 ); break; case STAR_BOTH: bMatch = strstr( (const char *)sWord, keyword ) != NULL; break; } if ( bMatch ) tLast.m_uWords |= (1UL << nWord); } } } // last space if any if ( pLastTokenEnd != pTokenizer->GetBufferEnd () ) { int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1; AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset ); } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_NONE; tLast.m_iStart = 0; tLast.m_iLengthBytes = 0; tLast.m_iWordID = 0; tLast.m_uWords = 0; // sum token lengths int iSourceCodes = 0; ARRAY_FOREACH ( i, m_dTokens ) { m_dTokens [i].m_iWeight = 0; if ( m_dTokens [i].m_iLengthBytes ) { if ( bUtf8 ) { //int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() ); int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() ); m_dTokens[i].m_iLengthCP = iLen; } else m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes; iSourceCodes += m_dTokens[i].m_iLengthCP; } else m_dTokens [i].m_iLengthCP = 0; }
/// Build a plugin lookup key from a plugin type and a plugin name.
/// The stored copy of the name is passed through ToLower() right after construction.
PluginKey_t ( PluginType_e eType, const char * sName )
	: m_eType ( eType )
	, m_sName ( sName )
{
	m_sName.ToLower();
}
void CISpellAffix::LoadLocale () { if ( m_bUseDictConversion ) printf ( "Using dictionary-defined character set\n" ); else if ( !m_sCharsetFile.IsEmpty () ) { FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" ); if ( pFile ) { printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () ); const int MAX_CHARSET_LENGTH = 4096; char szBuffer [MAX_CHARSET_LENGTH]; char * szResult = fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile ); if ( szResult ) { CSphVector<CSphRemapRange> dRemaps; if ( sphParseCharset ( szBuffer, dRemaps ) ) { m_bUseLowerCaser = true; m_LowerCaser.AddRemaps ( dRemaps, 0 ); } else { printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() ); } } else { printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() ); } fclose ( pFile ); } else { printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() ); } } else { if ( !m_sLocale.IsEmpty () ) { char dLocaleC[256], dLocaleUser[256]; setlocale ( LC_ALL, "C" ); for ( int i=0; i<256; i++ ) dLocaleC[i] = (char) tolower(i); char * szLocale = setlocale ( LC_CTYPE, m_sLocale.cstr() ); if ( szLocale ) { printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() ); for ( int i=0; i<256; i++ ) dLocaleUser[i] = (char) tolower(i); if ( !memcmp ( dLocaleC, dLocaleUser, 256 ) ) printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" ); } else printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() ); } else printf ( "WARNING: no character set specified\n" ); } }
bool sphPluginCreate ( const char * szLib, PluginType_e eType, const char * sName, ESphAttr eUDFRetType, CSphString & sError ) { #if !HAVE_DLOPEN sError = "no dlopen(), no plugins"; return false; #else if ( !g_bPluginsEnabled ) { sError = "plugin support disabled (requires a valid plugin_dir)"; return false; } // validate library name for ( const char * p = szLib; *p; p++ ) if ( *p=='/' || *p=='\\' ) { sError = "restricted character (path delimiter) in a library file name"; return false; } CSphString sLib = szLib; sLib.ToLower(); // FIXME? preregister known rankers instead? if ( eType==PLUGIN_RANKER ) { for ( int i=0; i<SPH_RANK_TOTAL; i++ ) { const char * r = sphGetRankerName ( ESphRankMode(i) ); if ( r && strcasecmp ( sName, r )==0 ) { sError.SetSprintf ( "%s is a reserved ranker name", r ); return false; } } } // from here, we need a lock (we intend to update the plugin hash) CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex ); // validate function name PluginKey_t k ( eType, sName ); if ( g_hPlugins(k) ) { sError.SetSprintf ( "plugin '%s' already exists", k.m_sName.cstr() ); return false; } // lookup or load library PluginLib_c * pLib = NULL; if ( g_hPluginLibs ( sLib ) ) { pLib = g_hPluginLibs [ sLib ]; pLib->AddRef(); } else { pLib = LoadPluginLibrary ( sLib.cstr(), sError ); if ( !pLib ) return false; } assert ( pLib->GetHandle() ); PluginDesc_c * pPlugin = NULL; const SymbolDesc_t * pSym = NULL; switch ( eType ) { case PLUGIN_RANKER: pPlugin = new PluginRanker_c ( pLib ); pSym = g_dSymbolsRanker; break; case PLUGIN_INDEX_TOKEN_FILTER: pPlugin = new PluginTokenFilter_c ( pLib ); pSym = g_dSymbolsTokenFilter; break; case PLUGIN_QUERY_TOKEN_FILTER: pPlugin = new PluginQueryTokenFilter_c ( pLib ); pSym = g_dSymbolsQueryTokenFilter; break; case PLUGIN_FUNCTION: pPlugin = new PluginUDF_c ( pLib, eUDFRetType ); pSym = g_dSymbolsUDF; break; default: sError.SetSprintf ( "INTERNAL ERROR: unknown plugin type %d in CreatePlugin()", (int)eType ); pLib->Release(); return 
false; } // release the refcount that this very function is holding // or in other words, transfer the refcount to newly created plugin instance (it does its own addref) pLib->Release(); if ( !PluginLoadSymbols ( pPlugin, pSym, pLib->GetHandle(), k.m_sName.cstr(), sError ) ) { sError.SetSprintf ( "%s in %s", sError.cstr(), sLib.cstr() ); pPlugin->Release(); return false; } // add library if needed if ( !g_hPluginLibs ( sLib ) ) { Verify ( g_hPluginLibs.Add ( pLib, pLib->GetName() ) ); pLib->AddRef(); // the hash reference } // add function Verify ( g_hPlugins.Add ( pPlugin, k ) ); pPlugin->GetLib()->m_iHashedPlugins++; return true; #endif // HAVE_DLOPEN }