예제 #1
0
bool CISpellDict::Load ( const char * szFilename )
{
	if ( !szFilename )
		return false;

	m_dEntries.Reset ();
	m_dEntries.Reserve ( 131072 );

	FILE * pFile = fopen ( szFilename, "rt" );
	if ( !pFile )
		return false;

	char szWordBuffer [MAX_STR_LENGTH];

	while ( !feof ( pFile ) )
	{
		char * szResult = fgets ( szWordBuffer, MAX_STR_LENGTH, pFile );
		if ( !szResult )
			break;

		int iPos = strlen ( szWordBuffer ) - 1;
		while ( iPos>=0 && isspace ( (unsigned char)szWordBuffer[iPos] ) )
			szWordBuffer [iPos--] = '\0';

		CISpellDictWord Word;

		char * szPosition = strchr ( szWordBuffer, '/' );
		if ( !szPosition )
		{
			szPosition = szWordBuffer;
			while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
				++szPosition;

			*szPosition = '\0';
			Word.m_sWord = szWordBuffer;

		} else
		{
			*szPosition = '\0';
			Word.m_sWord = szWordBuffer;
			++szPosition;
			char * szFlags = szPosition;
			while ( *szPosition && !isspace ( (unsigned char)*szPosition ) )
				++szPosition;

			*szPosition = '\0';
			Word.m_sFlags = szFlags;
		}

		m_dEntries.Add ( Word );
	}

	fclose ( pFile );

	return true;
}
예제 #2
0
// Save the words collected in m_hWords to the file named by sOutput.
//
// NOTE(review): this definition is cut off in this listing - only the part
// that opens the file and copies the hash into a vector is visible. iTop and
// bFreqs are not used in the visible part; presumably they limit the number
// of words written and toggle frequency output - confirm against the full source.
void CSphStopwordBuilderDict::Save ( const char * sOutput, int iTop, bool bFreqs )
{
	// open for writing; on failure return without reporting anything to the caller
	FILE * fp = fopen ( sOutput, "w+" );
	if ( !fp )
		return;

	CSphVector<Word_t> dTop;
	dTop.Reserve ( 1024 );

	// copy every (key, value) pair of the hash into dTop as (word, count)
	const CSphMTFHashEntry<int> * it;
	HASH_FOREACH ( it, m_hWords )
	{
		Word_t t;
		t.m_sWord = it->m_sKey.cstr();
		t.m_iCount = it->m_tValue;
		dTop.Add ( t );
	}
void ExcerptGen_c::TokenizeQuery ( const ExcerptQuery_t & tQuery, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	const bool bUtf8 = pTokenizer->IsUtf8();

	// tokenize query words
	int iWordsLength = strlen ( tQuery.m_sWords.cstr() );

	m_dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE *)tQuery.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			Token_t & tLast = m_dWords.Add();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			Keyword_t & kwLast = m_dKeywords.Add();
			kwLast.m_iLength = tLast.m_iLengthCP;

			// find stars
			bool bStarBack = ( *pTokenizer->GetTokenEnd()=='*' );
			bool bStarFront = ( pTokenizer->GetTokenStart()!=pTokenizer->GetBufferPtr() ) &&
				( pTokenizer->GetTokenStart()[-1]=='*' );
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			m_dKeywordsBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &m_dKeywordsBuffer [ iKwIndex ], (const char *)sWord ); // NOLINT
			iKwIndex = iEndIndex;

			if ( m_dWords.GetLength()==MAX_HIGHLIGHT_WORDS )
				break;
		}
	}
}
예제 #4
0
/// Tokenizes the query words and the source text of q and fills m_dTokens
/// as the groundwork for building an excerpt.
///
/// NOTE(review): this definition is cut off in this listing - it ends inside
/// the body, before any return statement - so the returned value and the
/// remaining steps are not documented here.
///
/// Visible steps:
///  1. split q.m_sWords into keywords; every keyword with a non-zero word ID
///     is appended to m_dWords, with its text stored in the local dKwBuffer
///     and its star flags in the local dKeywords;
///  2. split q.m_sSource into m_dTokens, calling AddJunk() for the gaps
///     between tokens, and set one bit in m_uWords per matching keyword;
///  3. append a terminating TOK_NONE token;
///  4. fill m_iLengthCP for every token and total it up in iSourceCodes.
///
/// @param q          excerpt query; m_sSource and m_sWords are read
/// @param pDict      dictionary used to map tokens to word IDs
/// @param pTokenizer tokenizer; its buffer is set first to the query words
///                   and then to the source text
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = q.m_sSource;

	const bool bUtf8 = pTokenizer->IsUtf8();
	m_bUtf8 = bUtf8;
	// tokenize query words
	int iWordsLength = strlen ( q.m_sWords.cstr() );

	// keyword texts, stored back to back and zero-terminated; it is resized
	// below every time a keyword is appended
	CSphVector<char> dKwBuffer ( iWordsLength );
	CSphVector<Keyword_t> dKeywords;
	dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0; // offset in dKwBuffer where the next keyword text goes

	pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		// tokens for which the dictionary returns a zero ID are skipped
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_dWords.Resize ( m_dWords.GetLength () + 1 );
			Token_t & tLast = m_dWords.Last ();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			dKeywords.Resize( dKeywords.GetLength() + 1 );
			Keyword_t & kwLast = dKeywords.Last ();

			// find stars: a '*' right after the token, or right before it
			// unless the token starts the buffer
			bool bStarBack = *pTokenizer->GetTokenEnd() == '*';
			bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) &&
				pTokenizer->GetTokenStart()[-1] == '*';
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token; the extra byte holds the terminating zero copied by strcpy
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			dKwBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord );
			iKwIndex = iEndIndex;

			// keep at most MAX_HIGHLIGHT_WORDS query words
			if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS )
				break;
		}
	}

	// tokenize document
	pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	// keep the tokenizer's buffer contents; token lengths are measured from it below
	m_sBufferUTF8 = pStartPtr;

	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		const char * pTokenStart = pTokenizer->GetTokenStart ();

		// hand the bytes between the previous token and this one to AddJunk
		// (skipped when the token starts at the very beginning of the buffer)
		if ( pTokenStart != pStartPtr )
			AddJunk ( pLastTokenEnd - pStartPtr,
					  pTokenStart - pLastTokenEnd,
					  pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		// a token with a zero word ID is stored as TOK_SPACE rather than TOK_WORD
		m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
		Token_t & tLast = m_dTokens.Last ();
		tLast.m_eType	= iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_iStart  = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;

		// fill word mask
		if ( iWord )
		{
			// bMatch is declared outside the keyword loop, so a keyword whose
			// m_uStar hits none of the cases below reuses the previous result
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ];
				const Token_t & token = m_dWords[nWord];

				switch ( dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					// no stars: compare word IDs
					bMatch = iWord == token.m_iWordID;
					break;

				case STAR_FRONT:
					// leading star: the keyword must match the end of the token
					// NOTE(review): iOffset comes from the token's byte length in the
					// source text but is applied to sWord - assumes both have the same
					// byte length; confirm
					iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes;
					bMatch = (iOffset >= 0) &&
						( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BACK:
					// trailing star: the keyword must match the start of the token
					bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) &&
						( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BOTH:
					// stars on both sides: the keyword may occur anywhere in the token
					bMatch = strstr( (const char *)sWord, keyword ) != NULL;
					break;
				}

				// one bit per query word, indexed by its position in m_dWords
				if ( bMatch )
					tLast.m_uWords |= (1UL << nWord);
			}
		}
	}

	// last space if any
	if ( pLastTokenEnd != pTokenizer->GetBufferEnd () )
	{
		int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1;
		AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset );
	}
	
	// append an empty TOK_NONE token after the last real one
	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_NONE;
	tLast.m_iStart  = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	// sum token lengths
	int iSourceCodes = 0;
	ARRAY_FOREACH ( i, m_dTokens )
	{
		m_dTokens [i].m_iWeight = 0;

		if ( m_dTokens [i].m_iLengthBytes )
		{
			if ( bUtf8 )
			{
				// measure the token with sphUTF8Len on its substring of m_sBufferUTF8
				//int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				m_dTokens[i].m_iLengthCP = iLen;
			}
			else
				m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes;
			iSourceCodes += m_dTokens[i].m_iLengthCP;
		}
		else
			m_dTokens [i].m_iLengthCP = 0;
	}
void ExcerptGen_c::TokenizeDocument ( char * pData, CSphDict * pDict, ISphTokenizer * pTokenizer, bool bFillMasks, bool bRetainHtml )
{
	m_iDocumentWords = 0;
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = pData;

	pTokenizer->SetBuffer ( (BYTE*)pData, strlen(pData) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	if ( bRetainHtml )
		pTokenizer->AddSpecials ( "<" );

	BYTE * sWord;
	DWORD uPosition = 0; // hit position in document
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		if ( pTokenizer->TokenIsBlended() )
			continue;

		const char * pTokenStart = pTokenizer->GetTokenStart ();

		if ( pTokenStart!=pStartPtr && pTokenStart>pLastTokenEnd )
			AddJunk ( pLastTokenEnd - pStartPtr,
				pTokenStart - pLastTokenEnd,
				pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		if ( bRetainHtml && *pTokenStart=='<' )
		{
			int iTagEnd = FindTagEnd ( pTokenStart );
			if ( iTagEnd!=-1 )
			{
				assert ( pTokenStart+iTagEnd<pTokenizer->GetBufferEnd() );
				AddJunk ( pTokenStart-pStartPtr, iTagEnd+1, pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );
				pTokenizer->SetBufferPtr ( pTokenStart+iTagEnd+1 );
				pLastTokenEnd = pTokenStart+iTagEnd+1; // fix it up to prevent adding last chunk on exit
				continue;
			}
		}

		SphWordID_t iWord = iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		if ( pTokenizer->GetBoundary() )
			uPosition += 100; // FIXME: this should be taken from index settings

		Token_t & tLast = m_dTokens.Add();
		tLast.m_eType = iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_uPosition = iWord ? ++uPosition : 0;
		tLast.m_iStart = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;
		if ( iWord )
			m_iDocumentWords++;

		m_iLastWord = iWord ? m_dTokens.GetLength() - 1 : m_iLastWord;

		// fill word mask
		if ( bFillMasks && iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * sKeyword = &m_dKeywordsBuffer [ m_dKeywords[nWord].m_iWord ];
				const Token_t & tToken = m_dWords[nWord];

				switch ( m_dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					bMatch = ( iWord==tToken.m_iWordID );
					break;

				case STAR_FRONT:
					iOffset = tLast.m_iLengthBytes - tToken.m_iLengthBytes;
					bMatch = ( iOffset>=0 ) &&
						( memcmp ( sKeyword, sWord + iOffset, tToken.m_iLengthBytes )==0 );
					break;

				case STAR_BACK:
					bMatch = ( tLast.m_iLengthBytes>=tToken.m_iLengthBytes ) &&
						( memcmp ( sKeyword, sWord, tToken.m_iLengthBytes )==0 );
					break;

				case STAR_BOTH:
					bMatch = strstr ( (const char *)sWord, sKeyword )!=NULL;
					break;
				}

				if ( bMatch )
				{
					tLast.m_uWords |= 1UL<<nWord;
					m_uFoundWords |= 1UL<<nWord;
				}
			}
		}
	}