Пример #1
0
void ExcerptGen_c::AddJunk ( int iStart, int iLength )
{
    int iChunkStart = iStart;

    for ( int i = iStart; i < iStart+iLength; i++ )
        if ( sphIsSpace ( m_sBuffer.cstr () [i] ) != sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) )
        {
            m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
            Token_t & tLast = m_dTokens.Last ();
            tLast.m_eType   = TOK_SPACE;
            tLast.m_iStart	= iChunkStart;
            tLast.m_iLengthBytes = i - iChunkStart;
            tLast.m_iWordID = 0;
            tLast.m_uWords = 0;

            iChunkStart = i;
        }

    m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
    Token_t & tLast = m_dTokens.Last ();
    tLast.m_eType   = TOK_SPACE;
    tLast.m_iStart	= iChunkStart;
    tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
    tLast.m_iWordID = 0;
    tLast.m_uWords = 0;
}
Пример #2
0
void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements )
{
	CSphString sError;
	CSphHTMLStripper tStripper ( true );
	if ( !tStripper.SetIndexedAttrs ( sIndexAttrs, sError )
		|| !tStripper.SetRemovedElements ( sRemoveElements, sError ) )
			sphDie ( "failed to configure stripper: %s", sError.cstr() );

	CSphVector<BYTE> dBuffer;
	while ( !feof(stdin) )
	{
		char sBuffer[1024];
		int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
		if ( !iLen )
			break;

		int iPos = dBuffer.GetLength();
		dBuffer.Resize ( iPos+iLen );
		memcpy ( &dBuffer[iPos], sBuffer, iLen );
	}
	dBuffer.Add ( 0 );

	tStripper.Strip ( &dBuffer[0] );
	fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] );
}
Пример #3
0
bool sphPluginParseSpec ( const CSphString & sParams, CSphVector<CSphString> & dParams, CSphString & sError )
{
	dParams.Resize ( 0 );
	sphSplit ( dParams, sParams.cstr(), ":" );

	switch ( dParams.GetLength() )
	{
	case 0:
		return true;

	case 1:
		sError = "filter name required in spec string; example: \"plugins.so:myfilter\"";
		return false;

	case 2:
		dParams.Add ( "" );
		return true;

	case 3:
		return true;
	}

	sError = "too many parts in spec string; must be in \"plugins.so:myfilter:options\" format";
	return false;
}
Пример #4
0
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	int iChunkStart = iStart;
	int iSaved = 0;

	for ( int i = iStart; i < iStart+iLength; i++ ){
		const char* buf_ptr = NULL;
		if(m_bUtf8){
			buf_ptr = m_sBufferUTF8.cstr ();
		}else{
			buf_ptr = m_sBuffer.cstr ();
		}
		if ( sphIsSpace (  buf_ptr[i] ) != sphIsSpace ( buf_ptr[iChunkStart] ) )
		{
			m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
			Token_t & tLast = m_dTokens.Last ();
			tLast.m_eType   = TOK_SPACE;
			tLast.m_iStart	= iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary != -1 && iSaved > iBoundary - iStart )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}
	}

	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_SPACE;
	tLast.m_iStart	= iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	if ( iBoundary != -1 ) AddBoundary();
}
Пример #5
0
void ExcerptGen_c::AddBoundary()
{
	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType = TOK_BREAK;
	tLast.m_iStart = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
}
void ExcerptGen_c::TokenizeQuery ( const ExcerptQuery_t & tQuery, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	const bool bUtf8 = pTokenizer->IsUtf8();

	// tokenize query words
	int iWordsLength = strlen ( tQuery.m_sWords.cstr() );

	m_dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE *)tQuery.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			Token_t & tLast = m_dWords.Add();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			Keyword_t & kwLast = m_dKeywords.Add();
			kwLast.m_iLength = tLast.m_iLengthCP;

			// find stars
			bool bStarBack = ( *pTokenizer->GetTokenEnd()=='*' );
			bool bStarFront = ( pTokenizer->GetTokenStart()!=pTokenizer->GetBufferPtr() ) &&
				( pTokenizer->GetTokenStart()[-1]=='*' );
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			m_dKeywordsBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &m_dKeywordsBuffer [ iKwIndex ], (const char *)sWord ); // NOLINT
			iKwIndex = iEndIndex;

			if ( m_dWords.GetLength()==MAX_HIGHLIGHT_WORDS )
				break;
		}
	}
}
Пример #7
0
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = q.m_sSource;

	const bool bUtf8 = pTokenizer->IsUtf8();
	m_bUtf8 = bUtf8;
	// tokenize query words
	int iWordsLength = strlen ( q.m_sWords.cstr() );

	CSphVector<char> dKwBuffer ( iWordsLength );
	CSphVector<Keyword_t> dKeywords;
	dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_dWords.Resize ( m_dWords.GetLength () + 1 );
			Token_t & tLast = m_dWords.Last ();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			dKeywords.Resize( dKeywords.GetLength() + 1 );
			Keyword_t & kwLast = dKeywords.Last ();

			// find stars
			bool bStarBack = *pTokenizer->GetTokenEnd() == '*';
			bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) &&
				pTokenizer->GetTokenStart()[-1] == '*';
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			dKwBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord );
			iKwIndex = iEndIndex;

			if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS )
				break;
		}
	}

	// tokenize document
	pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	//assign utf-8
	m_sBufferUTF8 = pStartPtr;

	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		const char * pTokenStart = pTokenizer->GetTokenStart ();

		if ( pTokenStart != pStartPtr )
			AddJunk ( pLastTokenEnd - pStartPtr,
					  pTokenStart - pLastTokenEnd,
					  pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
		Token_t & tLast = m_dTokens.Last ();
		tLast.m_eType	= iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_iStart  = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;

		// fill word mask
		if ( iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ];
				const Token_t & token = m_dWords[nWord];

				switch ( dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					bMatch = iWord == token.m_iWordID;
					break;

				case STAR_FRONT:
					iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes;
					bMatch = (iOffset >= 0) &&
						( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BACK:
					bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) &&
						( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BOTH:
					bMatch = strstr( (const char *)sWord, keyword ) != NULL;
					break;
				}

				if ( bMatch )
					tLast.m_uWords |= (1UL << nWord);
			}
		}
	}

	// last space if any
	if ( pLastTokenEnd != pTokenizer->GetBufferEnd () )
	{
		int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1;
		AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset );
	}
	
	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_NONE;
	tLast.m_iStart  = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	// sum token lengths
	int iSourceCodes = 0;
	ARRAY_FOREACH ( i, m_dTokens )
	{
		m_dTokens [i].m_iWeight = 0;

		if ( m_dTokens [i].m_iLengthBytes )
		{
			if ( bUtf8 )
			{
				//int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				m_dTokens[i].m_iLengthCP = iLen;
			}
			else
				m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes;
			iSourceCodes += m_dTokens[i].m_iLengthCP;
		}
		else
			m_dTokens [i].m_iLengthCP = 0;
	}
Пример #8
0
bool CSphConfigParser::TryToExec ( char * pBuffer, char * pEnd, const char * szFilename, CSphVector<char> & dResult )
{
	int dPipe[2] = { -1, -1 };

	if ( pipe ( dPipe ) )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe() failed (error=%s)", strerror(errno) );
		return false;
	}

	pBuffer = trim ( pBuffer );

	int iRead  = dPipe [0];
	int iWrite = dPipe [1];

	signal ( SIGCHLD, sigchld );

	int iChild = fork();

	if ( iChild == 0 )
	{
		close ( iRead );
		close ( STDOUT_FILENO );
		dup2 ( iWrite, STDOUT_FILENO );

		char * pPtr = pBuffer;
		char * pArgs = NULL;
		while ( *pPtr )
		{
			if ( sphIsSpace ( *pPtr ) )
			{
				*pPtr = '\0';
				pArgs = trim ( pPtr+1 );
				break;
			}

			pPtr++;
		}
		
		if ( pArgs )
			execl ( pBuffer, pBuffer, pArgs, szFilename, NULL );
		else
			execl ( pBuffer, pBuffer, szFilename, NULL );

		exit ( 1 );
	}
	else
		if ( iChild == -1 )
		{
			snprintf ( m_sError, sizeof ( m_sError ), "fork failed (error=%s)", strerror(errno) );
			return false;
		}

	close ( iWrite );

	int iBytesRead, iTotalRead = 0;
	const int BUFFER_SIZE = 65536;
	
	dResult.Reset ();

	do
	{
		dResult.Resize ( iTotalRead + BUFFER_SIZE );
		iBytesRead = read ( iRead, (void*)&(dResult [iTotalRead]), BUFFER_SIZE );
		iTotalRead += iBytesRead;
	}
	while ( iBytesRead > 0 );

	int iStatus;
	wait ( &iStatus );
	iStatus = (signed char) WEXITSTATUS (iStatus);

	if ( iStatus )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "error executing '%s'", pBuffer );
		return false;
	}

	if ( iBytesRead < 0  )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe read error (error=%s)", strerror(errno) );
		return false;
	}

	dResult.Resize ( iTotalRead + 1 );
	dResult [iTotalRead] = '\0';

	return true;
}