void ExcerptGen_c::AddBoundary()
{
	Token_t & tLast = m_dTokens.Add();
	tLast.m_eType = TOK_BREAK;
	tLast.m_iStart = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
	tLast.m_uPosition = 0;
}
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	assert ( iLength>0 );
	assert ( iLength<=m_sBuffer.Length() );
	assert ( iStart+iLength<=m_sBuffer.Length() );

	int iChunkStart = iStart;
	int iSaved = 0;

	for ( int i = iStart; i < iStart+iLength; i++ )
		if ( sphIsSpace ( m_sBuffer.cstr()[i] )!=sphIsSpace ( m_sBuffer.cstr()[iChunkStart] ) )
		{
			Token_t & tLast = m_dTokens.Add();
			tLast.m_eType = TOK_SPACE;
			tLast.m_iStart = iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;
			tLast.m_uPosition = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary!=-1 && iSaved > ( iBoundary-iStart ) )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}

	Token_t & tLast = m_dTokens.Add();
	tLast.m_eType = TOK_SPACE;
	tLast.m_iStart = iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
	tLast.m_uPosition = 0;

	if ( iBoundary!=-1 )
		AddBoundary();
}
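// Illustrative behavior sketch (hypothetical input, not from the source): for a
// junk region ",  <b>" the loop above emits one TOK_SPACE token per run of
// same-class characters, ie. "," then "  " then "<b>", since sphIsSpace() only
// distinguishes whitespace from non-whitespace.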
void ExcerptGen_c::AddJunk ( int iStart, int iLength )
{
	int iChunkStart = iStart;

	for ( int i = iStart; i < iStart+iLength; i++ )
		if ( sphIsSpace ( m_sBuffer.cstr()[i] )!=sphIsSpace ( m_sBuffer.cstr()[iChunkStart] ) )
		{
			m_dTokens.Resize ( m_dTokens.GetLength()+1 );
			Token_t & tLast = m_dTokens.Last();
			tLast.m_eType = TOK_SPACE;
			tLast.m_iStart = iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;
			iChunkStart = i;
		}

	m_dTokens.Resize ( m_dTokens.GetLength()+1 );
	Token_t & tLast = m_dTokens.Last();
	tLast.m_eType = TOK_SPACE;
	tLast.m_iStart = iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
}
// copied over from sphinxutils; remove at some point
void StrSplit ( CSphVector<CSphString> & dOut, const char * sIn )
{
	if ( !sIn )
		return;

	const char * p = sIn;
	while ( *p )
	{
		// skip non-alphas
		while ( (*p) && !IsAlpha(*p) )
			p++;
		if ( !(*p) )
			break;

		// this is my next token
		assert ( IsAlpha(*p) );
		const char * sNext = p;
		while ( IsAlpha(*p) )
			p++;
		if ( sNext!=p )
			dOut.Add().SetBinary ( sNext, p-sNext );
	}
}
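// A minimal usage sketch (hypothetical input, assuming the CSphVector/CSphString
// API used above): alpha runs are collected, everything else is a separator.
//
//	CSphVector<CSphString> dParts;
//	StrSplit ( dParts, "foo, bar-baz" );
//	// dParts now holds "foo", "bar", "baz"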
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	int iChunkStart = iStart;
	int iSaved = 0;

	// pick whichever buffer this generator was fed; offsets are byte offsets either way
	const char * sBuf = m_bUtf8 ? m_sBufferUTF8.cstr() : m_sBuffer.cstr();

	for ( int i = iStart; i < iStart+iLength; i++ )
	{
		if ( sphIsSpace ( sBuf[i] )!=sphIsSpace ( sBuf[iChunkStart] ) )
		{
			m_dTokens.Resize ( m_dTokens.GetLength()+1 );
			Token_t & tLast = m_dTokens.Last();
			tLast.m_eType = TOK_SPACE;
			tLast.m_iStart = iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary!=-1 && iSaved > iBoundary-iStart )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}
	}

	m_dTokens.Resize ( m_dTokens.GetLength()+1 );
	Token_t & tLast = m_dTokens.Last();
	tLast.m_eType = TOK_SPACE;
	tLast.m_iStart = iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	if ( iBoundary!=-1 )
		AddBoundary();
}
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = q.m_sSource;

	const bool bUtf8 = pTokenizer->IsUtf8();
	m_bUtf8 = bUtf8;

	// tokenize query words
	int iWordsLength = strlen ( q.m_sWords.cstr() );

	CSphVector<char> dKwBuffer ( iWordsLength );
	CSphVector<Keyword_t> dKeywords;
	dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_dWords.Resize ( m_dWords.GetLength()+1 );
			Token_t & tLast = m_dWords.Last();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			dKeywords.Resize ( dKeywords.GetLength()+1 );
			Keyword_t & kwLast = dKeywords.Last();

			// find stars
			bool bStarBack = *pTokenizer->GetTokenEnd()=='*';
			bool bStarFront = ( pTokenizer->GetTokenStart()!=pTokenizer->GetBufferPtr() )
				&& pTokenizer->GetTokenStart()[-1]=='*';
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			dKwBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &dKwBuffer[iKwIndex], (const char *)sWord );
			iKwIndex = iEndIndex;

			if ( m_dWords.GetLength()==MAX_HIGHLIGHT_WORDS )
				break;
		}
	}

	// tokenize document
	pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr(), strlen ( q.m_sSource.cstr() ) );

	const char * pStartPtr = pTokenizer->GetBufferPtr();
	const char * pLastTokenEnd = pStartPtr;

	// assign utf-8
	m_sBufferUTF8 = pStartPtr;

	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		const char * pTokenStart = pTokenizer->GetTokenStart();

		if ( pTokenStart!=pStartPtr )
			AddJunk ( pLastTokenEnd - pStartPtr, pTokenStart - pLastTokenEnd,
				pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd();

		m_dTokens.Resize ( m_dTokens.GetLength()+1 );
		Token_t & tLast = m_dTokens.Last();
		tLast.m_eType = iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_iStart = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;

		// fill word mask
		if ( iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ];
				const Token_t & token = m_dWords[nWord];

				switch ( dKeywords[nWord].m_uStar )
				{
					case STAR_NONE:
						bMatch = iWord==token.m_iWordID;
						break;

					case STAR_FRONT:
						iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes;
						bMatch = ( iOffset>=0 )
							&& ( memcmp ( keyword, sWord + iOffset, token.m_iLengthBytes )==0 );
						break;

					case STAR_BACK:
						bMatch = ( tLast.m_iLengthBytes>=token.m_iLengthBytes )
							&& ( memcmp ( keyword, sWord, token.m_iLengthBytes )==0 );
						break;

					case STAR_BOTH:
						bMatch = strstr ( (const char *)sWord, keyword )!=NULL;
						break;
				}

				if ( bMatch )
					tLast.m_uWords |= ( 1UL<<nWord );
			}
		}
	}

	// last space if any
	if ( pLastTokenEnd!=pTokenizer->GetBufferEnd() )
	{
		int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1;
		AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd() - pLastTokenEnd, iOffset );
	}

	m_dTokens.Resize ( m_dTokens.GetLength()+1 );
	Token_t & tLast = m_dTokens.Last();
	tLast.m_eType = TOK_NONE;
	tLast.m_iStart = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	// sum token lengths
	int iSourceCodes = 0;
	ARRAY_FOREACH ( i, m_dTokens )
	{
		m_dTokens[i].m_iWeight = 0;

		if ( m_dTokens[i].m_iLengthBytes )
		{
			if ( bUtf8 )
			{
				//int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				m_dTokens[i].m_iLengthCP = iLen;
			} else
				m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes;
			iSourceCodes += m_dTokens[i].m_iLengthCP;
		} else
			m_dTokens[i].m_iLengthCP = 0;
	}
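// A standalone sketch (hypothetical helper, not part of the original source) of
// the wildcard semantics implemented by the star switch above: STAR_NONE
// matches by word id, STAR_FRONT ("*word") by suffix, STAR_BACK ("word*") by
// prefix, STAR_BOTH ("*word*") by substring. Kept in comments, since the
// surrounding excerpt is truncated mid-function:
//
//	static bool MatchStarSketch ( const char * sToken, const char * sKeyword, DWORD uStar )
//	{
//		int iTokLen = (int) strlen ( sToken );
//		int iKwLen = (int) strlen ( sKeyword );
//		switch ( uStar )
//		{
//			case STAR_FRONT:	return iTokLen>=iKwLen && memcmp ( sKeyword, sToken+iTokLen-iKwLen, iKwLen )==0;
//			case STAR_BACK:		return iTokLen>=iKwLen && memcmp ( sKeyword, sToken, iKwLen )==0;
//			case STAR_BOTH:		return strstr ( sToken, sKeyword )!=NULL;
//			default:			return strcmp ( sToken, sKeyword )==0; // STAR_NONE, modulo dict word ids
//		}
//	}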
bool CSphConfigParser::TryToExec ( char * pBuffer, char * pEnd, const char * szFilename, CSphVector<char> & dResult )
{
	int dPipe[2] = { -1, -1 };

	if ( pipe ( dPipe ) )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe() failed (error=%s)", strerror(errno) );
		return false;
	}

	pBuffer = trim ( pBuffer );

	int iRead = dPipe[0];
	int iWrite = dPipe[1];

	signal ( SIGCHLD, sigchld );

	int iChild = fork();
	if ( iChild==0 )
	{
		// child: route stdout into the pipe and exec the command
		close ( iRead );
		close ( STDOUT_FILENO );
		dup2 ( iWrite, STDOUT_FILENO );

		// split the command into a program name and (at most one) argument
		char * pPtr = pBuffer;
		char * pArgs = NULL;
		while ( *pPtr )
		{
			if ( sphIsSpace ( *pPtr ) )
			{
				*pPtr = '\0';
				pArgs = trim ( pPtr+1 );
				break;
			}
			pPtr++;
		}

		// the execl() varargs sentinel must be a pointer, hence the cast
		if ( pArgs )
			execl ( pBuffer, pBuffer, pArgs, szFilename, (char*)NULL );
		else
			execl ( pBuffer, pBuffer, szFilename, (char*)NULL );

		exit ( 1 );

	} else if ( iChild==-1 )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "fork failed (error=%s)", strerror(errno) );
		return false;
	}

	close ( iWrite );

	int iBytesRead, iTotalRead = 0;
	const int BUFFER_SIZE = 65536;

	dResult.Reset();

	do
	{
		dResult.Resize ( iTotalRead + BUFFER_SIZE );
		iBytesRead = read ( iRead, (void*)&( dResult[iTotalRead] ), BUFFER_SIZE );
		if ( iBytesRead>0 ) // don't let a failed read() (-1) shrink the total
			iTotalRead += iBytesRead;
	} while ( iBytesRead > 0 );

	int iStatus;
	wait ( &iStatus );
	iStatus = (signed char) WEXITSTATUS ( iStatus );

	if ( iStatus )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "error executing '%s'", pBuffer );
		return false;
	}

	if ( iBytesRead < 0 )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe read error (error=%s)", strerror(errno) );
		return false;
	}

	dResult.Resize ( iTotalRead+1 );
	dResult[iTotalRead] = '\0';

	return true;
}
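// A minimal standalone sketch (hypothetical helper, POSIX only, not from the
// original source) of the same capture pattern: pipe(), fork(), dup2() the
// write end over stdout in the child, exec, then read the pipe to EOF and
// reap the child in the parent.
#include <unistd.h>
#include <sys/wait.h>
#include <string>

static bool CaptureStdoutSketch ( const char * sProg, const char * sArg, std::string & sOut )
{
	int dPipe[2];
	if ( pipe ( dPipe ) )
		return false;

	pid_t tChild = fork();
	if ( tChild==0 )
	{
		close ( dPipe[0] );
		dup2 ( dPipe[1], STDOUT_FILENO );
		execl ( sProg, sProg, sArg, (char*)NULL );
		_exit ( 1 ); // only reached if exec failed
	}
	if ( tChild==-1 )
	{
		close ( dPipe[0] );
		close ( dPipe[1] );
		return false;
	}

	close ( dPipe[1] );
	char sBuf[4096];
	ssize_t iGot;
	while ( ( iGot = read ( dPipe[0], sBuf, sizeof(sBuf) ) ) > 0 )
		sOut.append ( sBuf, iGot );
	close ( dPipe[0] );

	int iStatus = 0;
	waitpid ( tChild, &iStatus, 0 );
	return iGot==0 && WIFEXITED ( iStatus ) && WEXITSTATUS ( iStatus )==0;
}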
void UrlBreak ( Split_t & tBest, const char * sWord )
{
	const int iLen = strlen(sWord);

	tBest.m_Pos.Resize(0);

	// current partial splits
	// begin with an empty one
	CSphVector<Split_t> dSplits;
	dSplits.Add();

	// our best guess so far
	// begin with a trivial baseline one (ie. no splits at all)
	Prob_t p = g_LM.GetProb ( sWord, iLen );
	tBest.m_Pos.Add ( iLen );
	tBest.m_fProb = p.m_fProb;
	tBest.m_bAllDict = tBest.m_bAnyDict = p.m_bDict;
	if ( iLen>=DICT_COMPOUND_MIN && tBest.m_bAllDict )
	{
		static const float THRESH = logf ( DICT_COMPOUND_THRESH );
		if ( tBest.m_fProb<=THRESH )
			tBest.m_fProb *= DICT_COMPOUND_COEFF;
	}

	// work the current splits
	CSphVector<Split_t> dSplits2;
	while ( dSplits.GetLength() )
	{
		int iWorkedSplits = 0;
		float fPrevBest = tBest.m_fProb;

		ARRAY_FOREACH ( iSplit, dSplits )
		{
			Split_t & s = dSplits[iSplit];

			// filter out splits that were added before (!) a new best guess on the previous iteration
			if ( dSplits[iSplit] < tBest )
				continue;
			iWorkedSplits++;

			int iLast = 0;
			if ( s.m_Pos.GetLength() )
				iLast = s.m_Pos.Last();

			for ( int i=1+iLast; i<iLen; i++ )
			{
				// consider a split at position i
				// it generates a word candidate [iLast,i) and a tail [i,iLen)
				// let's score those
				Prob_t tCand = g_LM.GetProb ( sWord+iLast, i-iLast );
				Prob_t tTail = g_LM.GetProb ( sWord+i, iLen-i );

				// if the current best is all-keywords, the new candidates must be, too
				if ( tBest.m_bAllDict && !tCand.m_bDict )
					continue;

				// compute partial and full split candidates generated by the current guess
				Split_t tPartial = s;
				tPartial.AddSplitPos ( tCand, i );

				Split_t tFull = tPartial;
				tFull.AddSplitPos ( tTail, iLen );

				// check if the full one is our new best full one
				bool bNewBest = false;
				if ( tBest < tFull )
				{
					// FIXME? we do this even when the new split is *not* all-keywords,
					// but the old best split was; is this ever a problem?
					tBest = tFull;
					// tBest.Dump ( sWord, "new-best" );
					bNewBest = true;
				}

				// check if the resulting partial split is worth scanning further
				if ( tBest < tPartial )
				{
					dSplits2.Add ( tPartial );
					// dSplits2.Last().Dump ( sWord, "scan-partial" );
				}
			}
		}

		// damage control!
		// if we just processed over 100K candidate splits and got no improvement
		// lets assume that our chances of getting one are kinda low and bail
		if ( iWorkedSplits>=100000 && tBest.m_fProb>=fPrevBest )
			break;

		// keep going
		dSplits.SwapData ( dSplits2 );
		dSplits2.Resize ( 0 );
	}
}
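// An illustrative toy sketch (hypothetical, std-only; the real code above scores
// candidates with the g_LM language model and keeps a worklist of partial
// splits): recursively try every dictionary-word prefix, longest first, and
// return the first full segmentation found.
#include <string>
#include <vector>
#include <set>

static bool SegmentSketch ( const std::set<std::string> & tDict, const std::string & sWord,
	std::vector<std::string> & dParts )
{
	if ( sWord.empty() )
		return true; // fully consumed

	for ( size_t iLen = sWord.size(); iLen>0; iLen-- )
	{
		std::string sHead = sWord.substr ( 0, iLen );
		if ( !tDict.count ( sHead ) )
			continue;

		std::vector<std::string> dTail;
		if ( SegmentSketch ( tDict, sWord.substr ( iLen ), dTail ) )
		{
			dParts.clear();
			dParts.push_back ( sHead );
			dParts.insert ( dParts.end(), dTail.begin(), dTail.end() );
			return true;
		}
	}
	return false;
}
// eg. with tDict = { "buy", "buyers", "online" }, "buyersonline" yields { "buyers", "online" }.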
int main ( int iArgs, char ** dArgs )
{
	OutputMode_e eMode = M_DEFAULT;
	bool bUseCustomCharset = false;
	CSphString sDict, sAffix, sLocale, sCharsetFile, sResult = "result.txt";

	printf ( "spelldump, an ispell dictionary dumper\n\n" );

	int i = 1;
	for ( ; i < iArgs; i++ )
	{
		if ( !strcmp ( dArgs[i], "-c" ) )
		{
			if ( ++i==iArgs ) break;
			bUseCustomCharset = true;
			sCharsetFile = dArgs[i];

		} else if ( !strcmp ( dArgs[i], "-m" ) )
		{
			if ( ++i==iArgs ) break;
			char * sMode = dArgs[i];
			if ( !strcmp ( sMode, "debug" ) )		{ eMode = M_DEBUG; continue; }
			if ( !strcmp ( sMode, "duplicates" ) )	{ eMode = M_DUPLICATES; continue; }
			if ( !strcmp ( sMode, "last" ) )		{ eMode = M_LAST; continue; }
			if ( !strcmp ( sMode, "default" ) )		{ eMode = M_DEFAULT; continue; }
			printf ( "Unrecognized mode: %s\n", sMode );
			return 1;

		} else
			break;
	}

	switch ( iArgs - i )
	{
		case 4: sLocale = dArgs[i+3]; // fallthrough
		case 3: sResult = dArgs[i+2]; // fallthrough
		case 2:
			sAffix = dArgs[i+1];
			sDict = dArgs[i];
			break;
		default:
			printf ( "Usage: spelldump [options] <dictionary> <affix> [result] [locale-name]\n\n"
				"Options:\n"
				"-c <file>\tuse case conversion defined in <file>\n"
				"-m <mode>\toutput (conflict resolution) mode:\n"
				"\t\tdefault - try to guess the best way to resolve a conflict\n"
				"\t\tlast - choose last entry\n"
				"\t\tdebug - dump all mappings (with rules)\n"
				"\t\tduplicates - dump duplicate mappings only (with rules)\n" );
			if ( iArgs>1 )
			{
				printf ( "\n"
					"Examples:\n"
					"spelldump en.dict en.aff\n"
					"spelldump ru.dict ru.aff ru.txt ru_RU.CP1251\n"
					"spelldump ru.dict ru.aff ru.txt .1251\n" );
			}
			return 1;
	}

	printf ( "Loading dictionary...\n" );
	CISpellDict Dict;
	if ( !Dict.Load ( sDict.cstr() ) )
		sphDie ( "Error loading dictionary file '%s'\n", sDict.IsEmpty() ? "" : sDict.cstr() );

	printf ( "Loading affix file...\n" );
	CISpellAffix Affix ( sLocale.cstr(), bUseCustomCharset ? sCharsetFile.cstr() : NULL );
	if ( !Affix.Load ( sAffix.cstr() ) )
		sphDie ( "Error loading affix file '%s'\n", sAffix.IsEmpty() ? "" : sAffix.cstr() );

	if ( sResult.IsEmpty() )
		sphDie ( "No result file specified\n" );

	FILE * pFile = fopen ( sResult.cstr(), "wt" );
	if ( !pFile )
		sphDie ( "Unable to open '%s' for writing\n", sResult.cstr() );

	if ( eMode!=M_DEFAULT )
		printf ( "Output mode: %s\n", dModeName[eMode] );

	Dict.IterateStart();
	WordMap_t tWordMap;
	const CISpellDict::CISpellDictWord * pWord = NULL;
	int nDone = 0;
	while ( ( pWord = Dict.IterateNext() )!=NULL )
	{
		EmitResult ( tWordMap, pWord->m_sWord, pWord->m_sWord );

		if ( ( ++nDone % 10 )==0 )
		{
			printf ( "\rDictionary words processed: %d", nDone );
			fflush ( stdout );
		}

		if ( pWord->m_sFlags.IsEmpty() )
			continue;

		CSphString sWord, sWordForCross;
		int iFlagLen = strlen ( pWord->m_sFlags.cstr() );
		for ( int iFlag1 = 0; iFlag1 < iFlagLen; ++iFlag1 )
			for ( int iRule1 = 0; iRule1 < Affix.GetNumRules(); ++iRule1 )
			{
				CISpellAffixRule * pRule1 = Affix.GetRule ( iRule1 );
				if ( pRule1->Flag()!=pWord->m_sFlags.cstr()[iFlag1] )
					continue;

				sWord = pWord->m_sWord;
				if ( !pRule1->Apply ( sWord ) )
					continue;

				EmitResult ( tWordMap, sWord, pWord->m_sWord, pRule1->Flag() );

				// apply other rules
				if ( !Affix.CheckCrosses() )
					continue;

				if ( !pRule1->IsCrossProduct() )
					continue;

				for ( int iFlag2 = iFlag1+1; iFlag2 < iFlagLen; ++iFlag2 )
					for ( int iRule2 = 0; iRule2 < Affix.GetNumRules(); ++iRule2 )
					{
						CISpellAffixRule * pRule2 = Affix.GetRule ( iRule2 );
						if ( !pRule2->IsCrossProduct()
							|| pRule2->Flag()!=pWord->m_sFlags.cstr()[iFlag2]
							|| pRule2->IsPrefix()==pRule1->IsPrefix() )
							continue;

						sWordForCross = sWord;
						if ( pRule2->Apply ( sWordForCross ) )
							EmitResult ( tWordMap, sWordForCross, pWord->m_sWord, pRule1->Flag(), pRule2->Flag() );
					}
			}
	}
	printf ( "\rDictionary words processed: %d\n", nDone );

	// output
	CSphVector<const char *> dKeys;
	tWordMap.IterateStart();
	while ( tWordMap.IterateNext() )
		dKeys.Add ( tWordMap.IterateGetKey().cstr() );

	dKeys.Sort ( WordLess() );

	ARRAY_FOREACH ( iKey, dKeys )
	{
		const CSphVector<MapInfo_t> & dWords = tWordMap[dKeys[iKey]];
		const char * sKey = dKeys[iKey];

		switch ( eMode )
		{
			case M_LAST:
				fprintf ( pFile, "%s > %s\n", sKey, dWords.Last().m_sWord.cstr() );
				break;

			case M_EXACT_OR_LONGEST:
			{
				int iMatch = 0;
				int iLength = 0;
				ARRAY_FOREACH ( i, dWords )
				{
					// an exact (identity) mapping always wins
					if ( dWords[i].m_sWord==sKey )
					{
						iMatch = i;
						break;
					}

					// otherwise, prefer the longest candidate
					int iWordLength = strlen ( dWords[i].m_sWord.cstr() );
					if ( iWordLength>iLength )
					{
						iLength = iWordLength;
						iMatch = i;
					}
				}
				fprintf ( pFile, "%s > %s\n", sKey, dWords[iMatch].m_sWord.cstr() );
				break;
			}

			case M_DUPLICATES:
				if ( dWords.GetLength()==1 )
					break;
				// fallthrough to dump all mappings
			case M_DEBUG:
				ARRAY_FOREACH ( i, dWords )
					fprintf ( pFile, "%s > %s %s/%d\n", sKey, dWords[i].m_sWord.cstr(),
						dWords[i].m_sRules, dWords.GetLength() );
				break;
		}
	}

	fclose ( pFile );

	return 0;
}
int CISpellAffix::GetNumRules () const { return m_dRules.GetLength (); }
bool CISpellAffix::LoadMySpell ( FILE * pFile )
{
	char sBuffer	[MAX_STR_LENGTH];
	char sCondition	[MAX_STR_LENGTH];
	char sRemove	[MAX_STR_LENGTH];
	char sAppend	[MAX_STR_LENGTH];

	RuleType_e eRule = RULE_NONE;
	BYTE cFlag = 0;
	BYTE cCombine = 0;
	int iCount = 0, iLine = 0;
	const char * sMode = 0;

	while ( !feof ( pFile ) )
	{
		char * sLine = fgets ( sBuffer, MAX_STR_LENGTH, pFile );
		if ( !sLine )
			break;
		++iLine;

		// prefix and suffix rules
		RuleType_e eNewRule = RULE_NONE;
		if ( !strncmp ( sLine, "PFX", 3 ) )
		{
			eNewRule = RULE_PREFIXES;
			sMode = "prefix";

		} else if ( !strncmp ( sLine, "SFX", 3 ) )
		{
			eNewRule = RULE_SUFFIXES;
			sMode = "suffix";
		}

		if ( eNewRule!=RULE_NONE )
		{
			sLine += 3;
			while ( *sLine && isspace ( (unsigned char)*sLine ) )
				++sLine;

			if ( eNewRule!=eRule ) // new rule header
			{
				if ( iCount )
					printf ( "WARNING: Line %d: Premature end of entries.\n", iLine );

				if ( sscanf ( sLine, "%c %c %d", &cFlag, &cCombine, &iCount )!=3 ) // NOLINT
					printf ( "WARNING: Line %d: Malformed %s header\n", iLine, sMode );

				eRule = eNewRule;

			} else // current rule continued
			{
				*sRemove = *sAppend = 0;
				char cNewFlag;
				if ( sscanf ( sLine, "%c %s %s %s", &cNewFlag, sRemove, sAppend, sCondition )==4 ) // NOLINT
				{
					if ( cNewFlag!=cFlag )
						printf ( "WARNING: Line %d: Flag character mismatch\n", iLine );

					// a lone "0" means an empty strip/append string
					if ( *sRemove=='0' && *(sRemove+1)==0 )
						*sRemove = 0;
					if ( *sAppend=='0' && *(sAppend+1)==0 )
						*sAppend = 0;

					CISpellAffixRule Rule ( eRule, cFlag, cCombine=='Y', sCondition, sRemove, sAppend );
					m_dRules.Add ( Rule );

				} else
					printf ( "WARNING: Line %d: Malformed %s rule\n", iLine, sMode );

				if ( !--iCount )
					eRule = RULE_NONE;
			}
			continue;
		}
	}

	return true;
}
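// Illustrative input (standard MySpell/hunspell .aff syntax that the parser
// above expects; sample lines in the style of a typical English affix file):
//
//	SFX D Y 4
//	SFX D   0     d          e
//	SFX D   y     ied        [^aeiou]y
//	SFX D   0     ed         [^ey]
//	SFX D   0     ed         [aeiou]y
//
// The header carries the flag, the cross-product marker, and the entry count;
// each entry carries the flag, the strip string ("0" for none), the append
// string, and the condition the word ending must match.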
bool CISpellAffix::LoadISpell ( FILE * pFile )
{
	char szBuffer	[ MAX_STR_LENGTH ];
	char szCondition[ MAX_STR_LENGTH ];
	char szStrip	[ MAX_STR_LENGTH ];
	char szAppend	[ MAX_STR_LENGTH ];

	RuleType_e eRule = RULE_NONE;
	char cFlag = '\0';
	bool bCrossProduct = false;
	int iLine = 0;

	// TODO: parse all .aff character replacement commands
	while ( !feof ( pFile ) )
	{
		char * szResult = fgets ( szBuffer, MAX_STR_LENGTH, pFile );
		if ( !szResult )
			break;
		iLine++;

		if ( !strncasecmp ( szBuffer, "prefixes", 8 ) )
		{
			eRule = RULE_PREFIXES;
			continue;
		}

		if ( !strncasecmp ( szBuffer, "suffixes", 8 ) )
		{
			eRule = RULE_SUFFIXES;
			continue;
		}

		if ( !strncasecmp ( szBuffer, "wordchars", 9 ) )
		{
			char * szStart = szBuffer + 9;
			while ( *szStart && isspace ( (unsigned char)*szStart ) )
				++szStart;

			char * szRangeL = szStart;
			while ( *szStart && !isspace ( (unsigned char)*szStart ) )
				++szStart;

			if ( !*szStart )
			{
				printf ( "WARNING: Line %d: invalid 'wordchars' statement\n", iLine );
				continue;
			}

			*szStart = '\0';
			++szStart;

			while ( *szStart && isspace ( (unsigned char)*szStart ) )
				++szStart;

			char * szRangeU = szStart;
			while ( *szStart && !isspace ( (unsigned char)*szStart ) )
				++szStart;

			*szStart = '\0';

			if ( !AddToCharset ( szRangeL, szRangeU ) )
				printf ( "WARNING: Line %d: cannot add to charset: '%s' '%s'\n", iLine, szRangeL, szRangeU );

			continue;
		}

		if ( !strncasecmp ( szBuffer, "flag", 4 ) )
		{
			if ( eRule==RULE_NONE )
			{
				printf ( "WARNING: Line %d: 'flag' appears before prefixes or suffixes\n", iLine );
				continue;
			}

			char * szStart = szBuffer + 4;
			while ( *szStart && isspace ( (unsigned char)*szStart ) )
				++szStart;

			bCrossProduct = ( *szStart=='*' );
			cFlag = bCrossProduct ? *(szStart+1) : *(szStart);
			continue;
		}

		if ( eRule==RULE_NONE )
			continue;

		char * szComment = strchr ( szBuffer, '#' );
		if ( szComment )
			*szComment = '\0';

		if ( !*szBuffer )
			continue;

		szCondition[0] = '\0';
		szStrip[0] = '\0';
		szAppend[0] = '\0';

		int nFields = sscanf ( szBuffer, "%[^>\n]>%[^,\n],%[^\n]", szCondition, szStrip, szAppend ); // NOLINT

		Strip ( szCondition );
		Strip ( szStrip );
		Strip ( szAppend );

		switch ( nFields )
		{
			case 2: // no optional strip-string
				strcpy ( szAppend, szStrip ); // NOLINT
				szStrip[0] = '\0';
				break;
			case 3: // all read
				break;
			default: // invalid repl
				continue;
		}

		CISpellAffixRule Rule ( eRule, cFlag, bCrossProduct, szCondition, szStrip, szAppend );
		m_dRules.Add ( Rule );
	}

	return true;
}
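// Illustrative input (classic ispell .aff syntax that the parser above
// expects; sample lines in the style of the standard english.aff):
//
//	suffixes
//	flag *V:
//		E        > -E,IVE        # as in create > creative
//		[^E]     > IVE           # as in prevent > preventive
//
// The "%[^>\n]>%[^,\n],%[^\n]" scan splits each entry into the condition,
// the optional strip part, and the append string.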
void ExcerptGen_c::TokenizeDocument ( char * pData, CSphDict * pDict, ISphTokenizer * pTokenizer, bool bFillMasks, bool bRetainHtml )
{
	m_iDocumentWords = 0;
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = pData;

	pTokenizer->SetBuffer ( (BYTE*)pData, strlen(pData) );

	const char * pStartPtr = pTokenizer->GetBufferPtr();
	const char * pLastTokenEnd = pStartPtr;

	if ( bRetainHtml )
		pTokenizer->AddSpecials ( "<" );

	BYTE * sWord;
	DWORD uPosition = 0; // hit position in document
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		if ( pTokenizer->TokenIsBlended() )
			continue;

		const char * pTokenStart = pTokenizer->GetTokenStart();

		if ( pTokenStart!=pStartPtr && pTokenStart>pLastTokenEnd )
			AddJunk ( pLastTokenEnd - pStartPtr,
				pTokenStart - pLastTokenEnd,
				pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		if ( bRetainHtml && *pTokenStart=='<' )
		{
			int iTagEnd = FindTagEnd ( pTokenStart );
			if ( iTagEnd!=-1 )
			{
				assert ( pTokenStart+iTagEnd<pTokenizer->GetBufferEnd() );
				AddJunk ( pTokenStart-pStartPtr,
					iTagEnd+1,
					pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );
				pTokenizer->SetBufferPtr ( pTokenStart+iTagEnd+1 );
				pLastTokenEnd = pTokenStart+iTagEnd+1; // fix it up to prevent adding last chunk on exit
				continue;
			}
		}

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd();

		if ( pTokenizer->GetBoundary() )
			uPosition += 100; // FIXME: this should be taken from index settings

		Token_t & tLast = m_dTokens.Add();
		tLast.m_eType = iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_uPosition = iWord ? ++uPosition : 0;
		tLast.m_iStart = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;
		if ( iWord )
			m_iDocumentWords++;

		m_iLastWord = iWord ? m_dTokens.GetLength()-1 : m_iLastWord;

		// fill word mask
		if ( bFillMasks && iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * sKeyword = &m_dKeywordsBuffer [ m_dKeywords[nWord].m_iWord ];
				const Token_t & tToken = m_dWords[nWord];

				switch ( m_dKeywords[nWord].m_uStar )
				{
					case STAR_NONE:
						bMatch = ( iWord==tToken.m_iWordID );
						break;

					case STAR_FRONT:
						iOffset = tLast.m_iLengthBytes - tToken.m_iLengthBytes;
						bMatch = ( iOffset>=0 )
							&& ( memcmp ( sKeyword, sWord + iOffset, tToken.m_iLengthBytes )==0 );
						break;

					case STAR_BACK:
						bMatch = ( tLast.m_iLengthBytes>=tToken.m_iLengthBytes )
							&& ( memcmp ( sKeyword, sWord, tToken.m_iLengthBytes )==0 );
						break;

					case STAR_BOTH:
						bMatch = strstr ( (const char *)sWord, sKeyword )!=NULL;
						break;
				}

				if ( bMatch )
				{
					tLast.m_uWords |= 1UL<<nWord;
					m_uFoundWords |= 1UL<<nWord;
				}
			}
		}
	}
bool sphPluginReload ( const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	// find all plugins from the given library
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	CSphVector<PluginKey_t> dKeys;
	CSphVector<PluginDesc_c*> dPlugins;

	g_hPlugins.IterateStart();
	while ( g_hPlugins.IterateNext() )
	{
		PluginDesc_c * v = g_hPlugins.IterateGet();
		if ( v->GetLibName()==sName )
		{
			dKeys.Add ( g_hPlugins.IterateGetKey() );
			dPlugins.Add ( g_hPlugins.IterateGet() );
		}
	}

	// no plugins loaded? oops
	if ( dPlugins.GetLength()==0 )
	{
		sError.SetSprintf ( "no active plugins loaded from %s", sName );
		return false;
	}

	// load new library and check every plugin
#if !USE_WINDOWS
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError, true );
#else
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError );
#endif
	if ( !pNewLib )
		return false;

	// load all plugins
	CSphVector<PluginDesc_c*> dNewPlugins;
	ARRAY_FOREACH ( i, dPlugins )
	{
		PluginDesc_c * pDesc = NULL;
		const SymbolDesc_t * pSym = NULL;
		switch ( dKeys[i].m_eType )
		{
			case PLUGIN_RANKER:				pDesc = new PluginRanker_c ( pNewLib ); pSym = g_dSymbolsRanker; break;
			case PLUGIN_INDEX_TOKEN_FILTER:	pDesc = new PluginTokenFilter_c ( pNewLib ); pSym = g_dSymbolsTokenFilter; break;
			case PLUGIN_QUERY_TOKEN_FILTER:	pDesc = new PluginQueryTokenFilter_c ( pNewLib ); pSym = g_dSymbolsQueryTokenFilter; break;
			case PLUGIN_FUNCTION:			pDesc = new PluginUDF_c ( pNewLib, dPlugins[i]->GetUdfRetType() ); pSym = g_dSymbolsUDF; break;
			default:
				sphDie ( "INTERNAL ERROR: unknown plugin type %d in sphPluginReload()", (int)dKeys[i].m_eType );
				return false;
		}

		if ( !PluginLoadSymbols ( pDesc, pSym, pNewLib->GetHandle(), dKeys[i].m_sName.cstr(), sError ) )
		{
			pDesc->Release();
			break;
		}

		dNewPlugins.Add ( pDesc );
	}