void ExcerptGen_c::AddJunk ( int iStart, int iLength ) { int iChunkStart = iStart; for ( int i = iStart; i < iStart+iLength; i++ ) if ( sphIsSpace ( m_sBuffer.cstr () [i] ) != sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) ) { m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = i - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; iChunkStart = i; } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = iStart + iLength - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; }
void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements ) { CSphString sError; CSphHTMLStripper tStripper ( true ); if ( !tStripper.SetIndexedAttrs ( sIndexAttrs, sError ) || !tStripper.SetRemovedElements ( sRemoveElements, sError ) ) sphDie ( "failed to configure stripper: %s", sError.cstr() ); CSphVector<BYTE> dBuffer; while ( !feof(stdin) ) { char sBuffer[1024]; int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin ); if ( !iLen ) break; int iPos = dBuffer.GetLength(); dBuffer.Resize ( iPos+iLen ); memcpy ( &dBuffer[iPos], sBuffer, iLen ); } dBuffer.Add ( 0 ); tStripper.Strip ( &dBuffer[0] ); fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] ); }
/// Splits a plugin spec string into its parts and validates the part count.
///
/// dParams is cleared and refilled by sphSplit() using ":" (presumably the
/// delimiter set). Outcome by number of parts:
///  - 0: accepted, dParams stays empty;
///  - 1: rejected, the filter name is missing;
///  - 2: accepted, an empty third entry is appended;
///  - 3: accepted as is;
///  - anything else: rejected, too many parts.
///
/// @param sParams	spec string, e.g. "plugins.so:myfilter:options"
/// @param dParams	receives the parts
/// @param sError	receives the message when the spec is rejected
/// @return true if the spec was accepted, false otherwise
bool sphPluginParseSpec ( const CSphString & sParams, CSphVector<CSphString> & dParams, CSphString & sError )
{
	dParams.Resize ( 0 );
	sphSplit ( dParams, sParams.cstr(), ":" );

	const int iParts = dParams.GetLength();

	if ( iParts==0 || iParts==3 )
		return true;

	if ( iParts==2 )
	{
		// no options given; pad so callers always get three entries
		dParams.Add ( "" );
		return true;
	}

	if ( iParts==1 )
		sError = "filter name required in spec string; example: \"plugins.so:myfilter\"";
	else
		sError = "too many parts in spec string; must be in \"plugins.so:myfilter:options\" format";

	return false;
}
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary ) { int iChunkStart = iStart; int iSaved = 0; for ( int i = iStart; i < iStart+iLength; i++ ){ const char* buf_ptr = NULL; if(m_bUtf8){ buf_ptr = m_sBufferUTF8.cstr (); }else{ buf_ptr = m_sBuffer.cstr (); } if ( sphIsSpace ( buf_ptr[i] ) != sphIsSpace ( buf_ptr[iChunkStart] ) ) { m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = i - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; iChunkStart = i; iSaved += tLast.m_iLengthBytes; if ( iBoundary != -1 && iSaved > iBoundary - iStart ) { AddBoundary(); iBoundary = -1; } } } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_SPACE; tLast.m_iStart = iChunkStart; tLast.m_iLengthBytes = iStart + iLength - iChunkStart; tLast.m_iWordID = 0; tLast.m_uWords = 0; if ( iBoundary != -1 ) AddBoundary(); }
void ExcerptGen_c::AddBoundary() { m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_BREAK; tLast.m_iStart = 0; tLast.m_iLengthBytes = 0; tLast.m_iWordID = 0; tLast.m_uWords = 0; }
void ExcerptGen_c::TokenizeQuery ( const ExcerptQuery_t & tQuery, CSphDict * pDict, ISphTokenizer * pTokenizer ) { const bool bUtf8 = pTokenizer->IsUtf8(); // tokenize query words int iWordsLength = strlen ( tQuery.m_sWords.cstr() ); m_dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS ); BYTE * sWord; int iKwIndex = 0; pTokenizer->SetBuffer ( (BYTE *)tQuery.m_sWords.cstr(), iWordsLength ); while ( ( sWord = pTokenizer->GetToken() )!=NULL ) { SphWordID_t iWord = pDict->GetWordID ( sWord ); if ( iWord ) { Token_t & tLast = m_dWords.Add(); tLast.m_eType = TOK_WORD; tLast.m_iWordID = iWord; tLast.m_iLengthBytes = strlen ( (const char *)sWord ); tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes; // store keyword Keyword_t & kwLast = m_dKeywords.Add(); kwLast.m_iLength = tLast.m_iLengthCP; // find stars bool bStarBack = ( *pTokenizer->GetTokenEnd()=='*' ); bool bStarFront = ( pTokenizer->GetTokenStart()!=pTokenizer->GetBufferPtr() ) && ( pTokenizer->GetTokenStart()[-1]=='*' ); kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 ); // store token const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1; m_dKeywordsBuffer.Resize ( iEndIndex ); kwLast.m_iWord = iKwIndex; strcpy ( &m_dKeywordsBuffer [ iKwIndex ], (const char *)sWord ); // NOLINT iKwIndex = iEndIndex; if ( m_dWords.GetLength()==MAX_HIGHLIGHT_WORDS ) break; } } }
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer ) { m_dTokens.Reserve ( 1024 ); m_sBuffer = q.m_sSource; const bool bUtf8 = pTokenizer->IsUtf8(); m_bUtf8 = bUtf8; // tokenize query words int iWordsLength = strlen ( q.m_sWords.cstr() ); CSphVector<char> dKwBuffer ( iWordsLength ); CSphVector<Keyword_t> dKeywords; dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS ); BYTE * sWord; int iKwIndex = 0; pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength ); while ( ( sWord = pTokenizer->GetToken() ) != NULL ) { SphWordID_t iWord = pDict->GetWordID ( sWord ); if ( iWord ) { m_dWords.Resize ( m_dWords.GetLength () + 1 ); Token_t & tLast = m_dWords.Last (); tLast.m_eType = TOK_WORD; tLast.m_iWordID = iWord; tLast.m_iLengthBytes = strlen ( (const char *)sWord ); tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes; // store keyword dKeywords.Resize( dKeywords.GetLength() + 1 ); Keyword_t & kwLast = dKeywords.Last (); // find stars bool bStarBack = *pTokenizer->GetTokenEnd() == '*'; bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) && pTokenizer->GetTokenStart()[-1] == '*'; kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? 
STAR_BACK : 0 ); // store token const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1; dKwBuffer.Resize ( iEndIndex ); kwLast.m_iWord = iKwIndex; strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord ); iKwIndex = iEndIndex; if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS ) break; } } // tokenize document pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) ); const char * pStartPtr = pTokenizer->GetBufferPtr (); const char * pLastTokenEnd = pStartPtr; //assign utf-8 m_sBufferUTF8 = pStartPtr; while ( ( sWord = pTokenizer->GetToken() ) != NULL ) { const char * pTokenStart = pTokenizer->GetTokenStart (); if ( pTokenStart != pStartPtr ) AddJunk ( pLastTokenEnd - pStartPtr, pTokenStart - pLastTokenEnd, pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 ); SphWordID_t iWord = pDict->GetWordID ( sWord ); pLastTokenEnd = pTokenizer->GetTokenEnd (); m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = iWord ? 
TOK_WORD : TOK_SPACE; tLast.m_iStart = pTokenStart - pStartPtr; tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart; tLast.m_iWordID = iWord; tLast.m_uWords = 0; // fill word mask if ( iWord ) { bool bMatch = false; int iOffset; ARRAY_FOREACH ( nWord, m_dWords ) { const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ]; const Token_t & token = m_dWords[nWord]; switch ( dKeywords[nWord].m_uStar ) { case STAR_NONE: bMatch = iWord == token.m_iWordID; break; case STAR_FRONT: iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes; bMatch = (iOffset >= 0) && ( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 ); break; case STAR_BACK: bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) && ( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 ); break; case STAR_BOTH: bMatch = strstr( (const char *)sWord, keyword ) != NULL; break; } if ( bMatch ) tLast.m_uWords |= (1UL << nWord); } } } // last space if any if ( pLastTokenEnd != pTokenizer->GetBufferEnd () ) { int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1; AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset ); } m_dTokens.Resize ( m_dTokens.GetLength () + 1 ); Token_t & tLast = m_dTokens.Last (); tLast.m_eType = TOK_NONE; tLast.m_iStart = 0; tLast.m_iLengthBytes = 0; tLast.m_iWordID = 0; tLast.m_uWords = 0; // sum token lengths int iSourceCodes = 0; ARRAY_FOREACH ( i, m_dTokens ) { m_dTokens [i].m_iWeight = 0; if ( m_dTokens [i].m_iLengthBytes ) { if ( bUtf8 ) { //int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() ); int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() ); m_dTokens[i].m_iLengthCP = iLen; } else m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes; iSourceCodes += m_dTokens[i].m_iLengthCP; } else m_dTokens [i].m_iLengthCP = 0; }
bool CSphConfigParser::TryToExec ( char * pBuffer, char * pEnd, const char * szFilename, CSphVector<char> & dResult ) { int dPipe[2] = { -1, -1 }; if ( pipe ( dPipe ) ) { snprintf ( m_sError, sizeof ( m_sError ), "pipe() failed (error=%s)", strerror(errno) ); return false; } pBuffer = trim ( pBuffer ); int iRead = dPipe [0]; int iWrite = dPipe [1]; signal ( SIGCHLD, sigchld ); int iChild = fork(); if ( iChild == 0 ) { close ( iRead ); close ( STDOUT_FILENO ); dup2 ( iWrite, STDOUT_FILENO ); char * pPtr = pBuffer; char * pArgs = NULL; while ( *pPtr ) { if ( sphIsSpace ( *pPtr ) ) { *pPtr = '\0'; pArgs = trim ( pPtr+1 ); break; } pPtr++; } if ( pArgs ) execl ( pBuffer, pBuffer, pArgs, szFilename, NULL ); else execl ( pBuffer, pBuffer, szFilename, NULL ); exit ( 1 ); } else if ( iChild == -1 ) { snprintf ( m_sError, sizeof ( m_sError ), "fork failed (error=%s)", strerror(errno) ); return false; } close ( iWrite ); int iBytesRead, iTotalRead = 0; const int BUFFER_SIZE = 65536; dResult.Reset (); do { dResult.Resize ( iTotalRead + BUFFER_SIZE ); iBytesRead = read ( iRead, (void*)&(dResult [iTotalRead]), BUFFER_SIZE ); iTotalRead += iBytesRead; } while ( iBytesRead > 0 ); int iStatus; wait ( &iStatus ); iStatus = (signed char) WEXITSTATUS (iStatus); if ( iStatus ) { snprintf ( m_sError, sizeof ( m_sError ), "error executing '%s'", pBuffer ); return false; } if ( iBytesRead < 0 ) { snprintf ( m_sError, sizeof ( m_sError ), "pipe read error (error=%s)", strerror(errno) ); return false; } dResult.Resize ( iTotalRead + 1 ); dResult [iTotalRead] = '\0'; return true; }