void ExcerptGen_c::AddBoundary()
{
	Token_t & tLast = m_dTokens.Add();
	tLast.m_eType = TOK_BREAK;
	tLast.m_iStart = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
	tLast.m_uPosition = 0;
}
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	assert ( iLength>0 );
	assert ( iLength<=m_sBuffer.Length() );
	assert ( iStart+iLength<=m_sBuffer.Length() );

	int iChunkStart = iStart;
	int iSaved = 0;

	for ( int i = iStart; i < iStart+iLength; i++ )
		if ( sphIsSpace ( m_sBuffer.cstr () [i] )!=sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) )
		{
			Token_t & tLast = m_dTokens.Add();
			tLast.m_eType = TOK_SPACE;
			tLast.m_iStart = iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;
			tLast.m_uPosition = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary!=-1 && iSaved > ( iBoundary-iStart ) )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}

	Token_t & tLast = m_dTokens.Add();
	tLast.m_eType = TOK_SPACE;
	tLast.m_iStart = iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
	tLast.m_uPosition = 0;

	if ( iBoundary!=-1 )
		AddBoundary();
}
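The loop above alternates between runs of space and non-space bytes, emitting one TOK_SPACE token per run and injecting a TOK_BREAK once the saved byte count crosses the boundary offset. A minimal standalone sketch of the same run-splitting idea, using only the standard library (the Chunk/SplitRuns names are illustrative, not part of the Sphinx API):

#include <cctype>
#include <string>
#include <vector>

struct Chunk { int m_iStart; int m_iLen; };

// split [iStart, iStart+iLen) into maximal runs of space/non-space bytes,
// mirroring what AddJunk does before emitting its TOK_SPACE tokens
static std::vector<Chunk> SplitRuns ( const std::string & sBuf, int iStart, int iLen )
{
	std::vector<Chunk> dRuns;
	int iChunkStart = iStart;
	for ( int i = iStart; i < iStart+iLen; i++ )
		if ( !!isspace ( (unsigned char)sBuf[i] ) != !!isspace ( (unsigned char)sBuf[iChunkStart] ) )
		{
			dRuns.push_back ( { iChunkStart, i-iChunkStart } );
			iChunkStart = i;
		}
	dRuns.push_back ( { iChunkStart, iStart+iLen-iChunkStart } );
	return dRuns;
}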
Example #3
void ExcerptGen_c::AddJunk ( int iStart, int iLength )
{
    int iChunkStart = iStart;

    for ( int i = iStart; i < iStart+iLength; i++ )
        if ( sphIsSpace ( m_sBuffer.cstr () [i] ) != sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) )
        {
            m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
            Token_t & tLast = m_dTokens.Last ();
            tLast.m_eType   = TOK_SPACE;
            tLast.m_iStart	= iChunkStart;
            tLast.m_iLengthBytes = i - iChunkStart;
            tLast.m_iWordID = 0;
            tLast.m_uWords = 0;

            iChunkStart = i;
        }

    m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
    Token_t & tLast = m_dTokens.Last ();
    tLast.m_eType   = TOK_SPACE;
    tLast.m_iStart	= iChunkStart;
    tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
    tLast.m_iWordID = 0;
    tLast.m_uWords = 0;
}
Example #4
// copied over from sphinxutils; remove at some point
void StrSplit ( CSphVector<CSphString> & dOut, const char * sIn )
{
	if ( !sIn )
		return;

	const char * p = sIn;
	while ( *p )
	{
		// skip non-alphas
		while ( (*p) && !IsAlpha(*p) )
			p++;
		if ( !(*p) )
			break;

		// this is my next token
		assert ( IsAlpha(*p) );
		const char * sNext = p;
		while ( IsAlpha(*p) )
			p++;
		if ( sNext!=p )
			dOut.Add().SetBinary ( sNext, p-sNext );
	}
}
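A likely call site, assuming IsAlpha() accepts plain ASCII letters (its exact definition lives elsewhere in the source):

	CSphVector<CSphString> dTokens;
	StrSplit ( dTokens, "foo, bar;baz" );
	// dTokens now holds "foo", "bar", "baz"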
Example #5
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	int iChunkStart = iStart;
	int iSaved = 0;

	// pick the UTF-8 or single-byte buffer once; it does not change inside the loop
	const char * pBuf = m_bUtf8 ? m_sBufferUTF8.cstr () : m_sBuffer.cstr ();

	for ( int i = iStart; i < iStart+iLength; i++ )
	{
		if ( sphIsSpace ( pBuf[i] ) != sphIsSpace ( pBuf[iChunkStart] ) )
		{
			m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
			Token_t & tLast = m_dTokens.Last ();
			tLast.m_eType   = TOK_SPACE;
			tLast.m_iStart	= iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary != -1 && iSaved > iBoundary - iStart )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}
	}

	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_SPACE;
	tLast.m_iStart	= iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	if ( iBoundary != -1 )
		AddBoundary();
}
Example #6
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = q.m_sSource;

	const bool bUtf8 = pTokenizer->IsUtf8();
	m_bUtf8 = bUtf8;
	// tokenize query words
	int iWordsLength = strlen ( q.m_sWords.cstr() );

	CSphVector<char> dKwBuffer ( iWordsLength );
	CSphVector<Keyword_t> dKeywords;
	dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_dWords.Resize ( m_dWords.GetLength () + 1 );
			Token_t & tLast = m_dWords.Last ();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			dKeywords.Resize( dKeywords.GetLength() + 1 );
			Keyword_t & kwLast = dKeywords.Last ();

			// find stars
			bool bStarBack = *pTokenizer->GetTokenEnd() == '*';
			bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) &&
				pTokenizer->GetTokenStart()[-1] == '*';
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			dKwBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord );
			iKwIndex = iEndIndex;

			if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS )
				break;
		}
	}

	// tokenize document
	pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	// keep a copy of the source for UTF-8 length calculations below
	m_sBufferUTF8 = pStartPtr;

	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		const char * pTokenStart = pTokenizer->GetTokenStart ();

		if ( pTokenStart != pStartPtr )
			AddJunk ( pLastTokenEnd - pStartPtr,
					  pTokenStart - pLastTokenEnd,
					  pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
		Token_t & tLast = m_dTokens.Last ();
		tLast.m_eType	= iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_iStart  = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;

		// fill word mask
		if ( iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ];
				const Token_t & token = m_dWords[nWord];

				switch ( dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					bMatch = iWord == token.m_iWordID;
					break;

				case STAR_FRONT:
					iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes;
					bMatch = (iOffset >= 0) &&
						( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BACK:
					bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) &&
						( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BOTH:
					bMatch = strstr( (const char *)sWord, keyword ) != NULL;
					break;
				}

				if ( bMatch )
					tLast.m_uWords |= (1UL << nWord);
			}
		}
	}

	// last space if any
	if ( pLastTokenEnd != pTokenizer->GetBufferEnd () )
	{
		int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1;
		AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset );
	}
	
	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_NONE;
	tLast.m_iStart  = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	// sum token lengths
	int iSourceCodes = 0;
	ARRAY_FOREACH ( i, m_dTokens )
	{
		m_dTokens [i].m_iWeight = 0;

		if ( m_dTokens [i].m_iLengthBytes )
		{
			if ( bUtf8 )
			{
				//int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				m_dTokens[i].m_iLengthCP = iLen;
			}
			else
				m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes;
			iSourceCodes += m_dTokens[i].m_iLengthCP;
		}
		else
			m_dTokens [i].m_iLengthCP = 0;
	}
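The four STAR_* cases above implement plain wildcard matching over byte strings. A self-contained sketch of that matching logic (here STAR_NONE compares the strings directly, purely for illustration; the real code compares word IDs):

#include <cstring>

enum { STAR_NONE = 0, STAR_FRONT = 1, STAR_BACK = 2, STAR_BOTH = 3 };

// does sWord match sKeyword under the given star flags?
static bool MatchStar ( const char * sWord, const char * sKeyword, int uStar )
{
	const int iWordLen = (int) strlen ( sWord );
	const int iKeyLen = (int) strlen ( sKeyword );
	switch ( uStar )
	{
		case STAR_FRONT:	// "*foo": keyword must be a suffix of the word
			return iWordLen>=iKeyLen && memcmp ( sKeyword, sWord+iWordLen-iKeyLen, iKeyLen )==0;
		case STAR_BACK:		// "foo*": keyword must be a prefix of the word
			return iWordLen>=iKeyLen && memcmp ( sKeyword, sWord, iKeyLen )==0;
		case STAR_BOTH:		// "*foo*": keyword may occur anywhere in the word
			return strstr ( sWord, sKeyword )!=NULL;
		default:			// exact match
			return strcmp ( sWord, sKeyword )==0;
	}
}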
Example #7
bool CSphConfigParser::TryToExec ( char * pBuffer, char * pEnd, const char * szFilename, CSphVector<char> & dResult )
{
	int dPipe[2] = { -1, -1 };

	if ( pipe ( dPipe ) )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe() failed (error=%s)", strerror(errno) );
		return false;
	}

	pBuffer = trim ( pBuffer );

	int iRead  = dPipe [0];
	int iWrite = dPipe [1];

	signal ( SIGCHLD, sigchld );

	int iChild = fork();

	if ( iChild == 0 )
	{
		close ( iRead );
		close ( STDOUT_FILENO );
		dup2 ( iWrite, STDOUT_FILENO );

		char * pPtr = pBuffer;
		char * pArgs = NULL;
		while ( *pPtr )
		{
			if ( sphIsSpace ( *pPtr ) )
			{
				*pPtr = '\0';
				pArgs = trim ( pPtr+1 );
				break;
			}

			pPtr++;
		}
		
		if ( pArgs )
			execl ( pBuffer, pBuffer, pArgs, szFilename, (char*)NULL ); // the variadic sentinel must be a pointer
		else
			execl ( pBuffer, pBuffer, szFilename, (char*)NULL );

		exit ( 1 );
	}
	else
		if ( iChild == -1 )
		{
			snprintf ( m_sError, sizeof ( m_sError ), "fork failed (error=%s)", strerror(errno) );
			return false;
		}

	close ( iWrite );

	int iBytesRead, iTotalRead = 0;
	const int BUFFER_SIZE = 65536;
	
	dResult.Reset ();

	do
	{
		dResult.Resize ( iTotalRead + BUFFER_SIZE );
		iBytesRead = read ( iRead, (void*)&(dResult [iTotalRead]), BUFFER_SIZE );
		if ( iBytesRead>0 )
			iTotalRead += iBytesRead; // only count successful reads; a -1 would corrupt the total
	}
	while ( iBytesRead > 0 );

	close ( iRead );

	int iStatus;
	wait ( &iStatus );
	iStatus = (signed char) WEXITSTATUS (iStatus);

	if ( iStatus )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "error executing '%s'", pBuffer );
		return false;
	}

	if ( iBytesRead < 0  )
	{
		snprintf ( m_sError, sizeof ( m_sError ), "pipe read error (error=%s)", strerror(errno) );
		return false;
	}

	dResult.Resize ( iTotalRead + 1 );
	dResult [iTotalRead] = '\0';

	return true;
}
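Typical use is expanding a shebang-style line from a config file into its output; a hypothetical call site (the parser instance and buffer names here are assumptions):

	CSphVector<char> dResult;
	if ( tParser.TryToExec ( sShebangLine, sShebangLine+strlen(sShebangLine), "sphinx.conf", dResult ) )
		printf ( "preprocessed config:\n%s\n", &dResult[0] );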
Example #8
void UrlBreak ( Split_t & tBest, const char * sWord )
{
	const int iLen = strlen(sWord);

	tBest.m_Pos.Resize(0);

	// current partial splits
	// begin with an empty one
	CSphVector<Split_t> dSplits;
	dSplits.Add();

	// our best guess so far
	// begin with a trivial baseline one (ie. no splits at all)
	Prob_t p = g_LM.GetProb ( sWord, iLen );
	tBest.m_Pos.Add ( iLen );
	tBest.m_fProb = p.m_fProb;
	tBest.m_bAllDict = tBest.m_bAnyDict = p.m_bDict;

	if ( iLen>=DICT_COMPOUND_MIN && tBest.m_bAllDict )
	{
		static const float THRESH = logf ( DICT_COMPOUND_THRESH );
		if ( tBest.m_fProb<=THRESH )
			tBest.m_fProb *= DICT_COMPOUND_COEFF;
	}

	// work the current splits
	CSphVector<Split_t> dSplits2;
	while ( dSplits.GetLength() )
	{
		int iWorkedSplits = 0;
		float fPrevBest = tBest.m_fProb;

		ARRAY_FOREACH ( iSplit, dSplits )
		{
			Split_t & s = dSplits[iSplit];

			// filter out splits that were added before (!) a new best guess on the previous iteration
			if ( dSplits[iSplit] < tBest )
				continue;
			iWorkedSplits++;

			int iLast = 0;
			if ( s.m_Pos.GetLength() )
				iLast = s.m_Pos.Last();

			for ( int i=1+iLast; i<iLen; i++ )
			{
				// consider a split at position i
				// it generates a word candidate [iLast,i) and a tail [i,iLen)
				// let's score those
				Prob_t tCand = g_LM.GetProb ( sWord+iLast, i-iLast );
				Prob_t tTail = g_LM.GetProb ( sWord+i, iLen-i );

				// if the current best is all-keywords, the new candidates must be, too
				if ( tBest.m_bAllDict && !tCand.m_bDict )
					continue;

				// compute partial and full split candidates generated by the current guess
				Split_t tPartial = s;
				tPartial.AddSplitPos ( tCand, i );

				Split_t tFull = tPartial;
				tFull.AddSplitPos ( tTail, iLen );

				// check if the full one is our new best full one
				bool bNewBest = false;
				if ( tBest < tFull )
				{
					// FIXME? we do this even when the new split is *not* all-keywords,
					// but the old best split was; is this ever a problem?
					tBest = tFull;
//					tBest.Dump ( sWord, "new-best" );
					bNewBest = true;
				}

				// check if the resulting partial split is worth scanning further
				if ( tBest < tPartial )
				{
					dSplits2.Add ( tPartial );
//					dSplits2.Last().Dump ( sWord, "scan-partial" );
				}
			}
		}

		// damage control!
		// if we just processed over 100K candidate splits and got no improvement
		// lets assume that our chances of getting one are kinda low and bail
		if ( iWorkedSplits>=100000 && tBest.m_fProb>=fPrevBest )
			break;

		// keep going
		dSplits.SwapData ( dSplits2 );
		dSplits2.Resize ( 0 );
	}
}
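Split_t::AddSplitPos() is not shown in this excerpt; judging from its use above, it plausibly appends the split position and accumulates the word's log-probability, roughly along these lines (an assumption, not the actual implementation):

// hypothetical sketch of Split_t::AddSplitPos
void Split_t::AddSplitPos ( const Prob_t & tProb, int iPos )
{
	m_Pos.Add ( iPos );				// record the split boundary
	m_fProb += tProb.m_fProb;		// log-probabilities of the parts add up
	m_bAllDict &= tProb.m_bDict;	// still all dictionary words?
	m_bAnyDict |= tProb.m_bDict;	// at least one dictionary word?
}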
Example #9
int main ( int iArgs, char ** dArgs )
{
	OutputMode_e eMode = M_DEFAULT;
	bool bUseCustomCharset = false;
	CSphString sDict, sAffix, sLocale, sCharsetFile, sResult = "result.txt";

	printf ( "spelldump, an ispell dictionary dumper\n\n" );

	int i = 1;
	for ( ; i < iArgs; i++ )
	{
		if ( !strcmp ( dArgs[i], "-c" ) )
		{
			if ( ++i==iArgs ) break;
			bUseCustomCharset = true;
			sCharsetFile = dArgs[i];

		} else if ( !strcmp ( dArgs[i], "-m" ) )
		{
			if ( ++i==iArgs ) break;
			char * sMode = dArgs[i];

			if ( !strcmp ( sMode, "debug" ) )		{ eMode = M_DEBUG; continue; }
			if ( !strcmp ( sMode, "duplicates" ) )	{ eMode = M_DUPLICATES; continue; }
			if ( !strcmp ( sMode, "last" ) )		{ eMode = M_LAST; continue; }
			if ( !strcmp ( sMode, "default" ) )		{ eMode = M_DEFAULT; continue; }

			printf ( "Unrecognized mode: %s\n", sMode );
			return 1;

		} else
			break;
	}

	switch ( iArgs - i )
	{
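		// each case below intentionally falls through: the trailing arguments are optional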
		case 4:
			sLocale = dArgs[i + 3];
		case 3:
			sResult = dArgs[i + 2];
		case 2:
			sAffix = dArgs[i + 1];
			sDict = dArgs[i];
			break;
		default:
			printf ( "Usage: spelldump [options] <dictionary> <affix> [result] [locale-name]\n\n"
				"Options:\n"
				"-c <file>\tuse case convertion defined in <file>\n"
				"-m <mode>\toutput (conflict resolution) mode:\n"
				"\t\tdefault - try to guess the best way to resolve a conflict\n"
				"\t\tlast - choose last entry\n"
				"\t\tdebug - dump all mappings (with rules)\n"
				"\t\tduplicates - dump duplicate mappings only (with rules)\n" );
			if ( iArgs>1 )
			{
				printf ( "\n"
					"Examples:\n"
					"spelldump en.dict en.aff\n"
					"spelldump ru.dict ru.aff ru.txt ru_RU.CP1251\n"
					"spelldump ru.dict ru.aff ru.txt .1251\n" );
			}
			return 1;
	}

	printf ( "Loading dictionary...\n" );
	CISpellDict Dict;
	if ( !Dict.Load ( sDict.cstr () ) )
		sphDie ( "Error loading dictionary file '%s'\n", sDict.IsEmpty () ? "" : sDict.cstr () );

	printf ( "Loading affix file...\n" );
	CISpellAffix Affix ( sLocale.cstr (), bUseCustomCharset ? sCharsetFile.cstr () : NULL );
	if ( !Affix.Load ( sAffix.cstr () ) )
		sphDie ( "Error loading affix file '%s'\n", sAffix.IsEmpty () ? "" : sAffix.cstr () );

	if ( sResult.IsEmpty () )
		sphDie ( "No result file specified\n" );

	FILE * pFile = fopen ( sResult.cstr (), "wt" );
	if ( !pFile )
		sphDie ( "Unable to open '%s' for writing\n", sResult.cstr () );

	if ( eMode!=M_DEFAULT )
		printf ( "Output mode: %s\n", dModeName[eMode] );

	Dict.IterateStart ();
	WordMap_t tWordMap;
	const CISpellDict::CISpellDictWord * pWord = NULL;
	int nDone = 0;
	while ( ( pWord = Dict.IterateNext () )!=NULL )
	{
		EmitResult ( tWordMap, pWord->m_sWord, pWord->m_sWord );

		if ( ( ++nDone % 10 )==0 )
		{
			printf ( "\rDictionary words processed: %d", nDone );
			fflush ( stdout );
		}

		if ( pWord->m_sFlags.IsEmpty() )
			continue;

		CSphString sWord, sWordForCross;
		int iFlagLen = strlen ( pWord->m_sFlags.cstr () );
		for ( int iFlag1 = 0; iFlag1 < iFlagLen; ++iFlag1 )
			for ( int iRule1 = 0; iRule1 < Affix.GetNumRules (); ++iRule1 )
			{
				CISpellAffixRule * pRule1 = Affix.GetRule ( iRule1 );
				if ( pRule1->Flag()!=pWord->m_sFlags.cstr()[iFlag1] )
					continue;

				sWord = pWord->m_sWord;

				if ( !pRule1->Apply ( sWord ) )
					continue;

				EmitResult ( tWordMap, sWord, pWord->m_sWord, pRule1->Flag() );

				// apply other rules
				if ( !Affix.CheckCrosses() )
					continue;

				if ( !pRule1->IsCrossProduct() )
					continue;

				for ( int iFlag2 = iFlag1 + 1; iFlag2 < iFlagLen; ++iFlag2 )
					for ( int iRule2 = 0; iRule2 < Affix.GetNumRules (); ++iRule2 )
					{
						CISpellAffixRule * pRule2 = Affix.GetRule ( iRule2 );
						if ( !pRule2->IsCrossProduct () || pRule2->Flag()!=pWord->m_sFlags.cstr()[iFlag2] ||
							pRule2->IsPrefix()==pRule1->IsPrefix() )
							continue;

						sWordForCross = sWord;
						if ( pRule2->Apply ( sWordForCross ) )
							EmitResult ( tWordMap, sWordForCross, pWord->m_sWord, pRule1->Flag(), pRule2->Flag() );
					}
			}
	}
	printf ( "\rDictionary words processed: %d\n", nDone );

	// output

	CSphVector<const char *> dKeys;
	tWordMap.IterateStart();
	while ( tWordMap.IterateNext() )
		dKeys.Add ( tWordMap.IterateGetKey().cstr() );
	dKeys.Sort ( WordLess() );

	ARRAY_FOREACH ( iKey, dKeys )
	{
		const CSphVector<MapInfo_t> & dWords = tWordMap[dKeys[iKey]];
		const char * sKey = dKeys[iKey];

		switch ( eMode )
		{
			case M_LAST:
				fprintf ( pFile, "%s > %s\n", sKey, dWords.Last().m_sWord.cstr() );
				break;

			case M_EXACT_OR_LONGEST:
			{
				int iMatch = 0;
				int iLength = 0;

				ARRAY_FOREACH ( i, dWords )
				{
					if ( dWords[i].m_sWord==sKey )
					{
						iMatch = i;
						break;
					}

					int iWordLength = strlen ( dWords[i].m_sWord.cstr() );
					if ( iWordLength>iLength )
					{
						iLength = iWordLength;
						iMatch = i;
					}
				}

				fprintf ( pFile, "%s > %s\n", sKey, dWords[iMatch].m_sWord.cstr() );
				break;
			}

			case M_DUPLICATES:
				if ( dWords.GetLength()==1 ) break;
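				// deliberate fall-through: keys with multiple mappings are dumped in debug format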
			case M_DEBUG:
				ARRAY_FOREACH ( i, dWords )
					fprintf ( pFile, "%s > %s %s/%d\n", sKey, dWords[i].m_sWord.cstr(),
						dWords[i].m_sRules, dWords.GetLength() );
				break;
		}
	}

	fclose ( pFile );

	return 0;
}
Example #10
int CISpellAffix::GetNumRules () const
{
	return m_dRules.GetLength ();
}
Example #11
bool CISpellAffix::LoadMySpell ( FILE * pFile )
{
	char sBuffer	[MAX_STR_LENGTH];
	char sCondition	[MAX_STR_LENGTH];
	char sRemove	[MAX_STR_LENGTH];
	char sAppend	[MAX_STR_LENGTH];

	RuleType_e eRule = RULE_NONE;
	BYTE cFlag = 0;
	BYTE cCombine = 0;
	int iCount = 0, iLine = 0;
	const char * sMode = 0;

	while ( !feof ( pFile ) )
	{
		char * sLine = fgets ( sBuffer, MAX_STR_LENGTH, pFile );
		if ( !sLine )
			break;
		++iLine;

		// prefix and suffix rules
		RuleType_e eNewRule = RULE_NONE;
		if ( !strncmp ( sLine, "PFX", 3 ) )
		{
			eNewRule = RULE_PREFIXES;
			sMode = "prefix";

		} else if ( !strncmp ( sLine, "SFX", 3 ) )
		{
			eNewRule = RULE_SUFFIXES;
			sMode = "suffix";
		}

		if ( eNewRule!=RULE_NONE )
		{
			sLine += 3;
			while ( *sLine && isspace ( (unsigned char) *sLine ) )
				++sLine;

			if ( eNewRule!=eRule ) // new rule header
			{
				if ( iCount )
					printf ( "WARNING: Line %d: Premature end of entries.\n", iLine );

				if ( sscanf ( sLine, "%c %c %d", &cFlag, &cCombine, &iCount )!=3 ) // NOLINT
					printf ( "WARNING; Line %d: Malformed %s header\n", iLine, sMode );

				eRule = eNewRule;

			} else // current rule continued
			{
				*sRemove = *sAppend = 0;
				char cNewFlag;
				if ( sscanf ( sLine, "%c %s %s %s", &cNewFlag, sRemove, sAppend, sCondition )==4 ) // NOLINT
				{
					if ( cNewFlag!=cFlag )
						printf ( "WARNING: Line %d: Flag character mismatch\n", iLine );

					if ( *sRemove=='0' && *(sRemove + 1)==0 ) *sRemove = 0;
					if ( *sAppend=='0' && *(sAppend + 1)==0 ) *sAppend = 0;

					CISpellAffixRule Rule ( eRule, cFlag, cCombine=='Y', sCondition, sRemove, sAppend );
					m_dRules.Add ( Rule );

				} else
					printf ( "WARNING: Line %d: Malformed %s rule\n", iLine, sMode );

				if ( !--iCount ) eRule = RULE_NONE;
			}
			continue;
		}
	}
	return true;
}
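For reference, the MySpell-format blocks this parser expects look roughly like the following (a typical English '-ed' suffix block; the header line carries the flag, cross-product flag, and entry count, then one line per entry with flag, strip, append, and condition):

SFX D Y 4
SFX D   0     d          e
SFX D   y     ied        [^aeiou]y
SFX D   0     ed         [^ey]
SFX D   0     ed         [aeiou]y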
Example #12
bool CISpellAffix::LoadISpell ( FILE * pFile )
{
	char szBuffer [ MAX_STR_LENGTH ];
	char szCondition [ MAX_STR_LENGTH ];
	char szStrip [ MAX_STR_LENGTH ];
	char szAppend [ MAX_STR_LENGTH ];

	RuleType_e eRule = RULE_NONE;
	char cFlag = '\0';
	bool bCrossProduct = false;
	int iLine = 0;

	// TODO: parse all .aff character replacement commands
	while ( !feof ( pFile ) )
	{
		char * szResult = fgets ( szBuffer, MAX_STR_LENGTH, pFile );
		if ( !szResult )
			break;
		iLine++;

		if ( !strncasecmp ( szBuffer, "prefixes", 8 ) )
		{
			eRule = RULE_PREFIXES;
			continue;
		}

		if ( !strncasecmp ( szBuffer, "suffixes", 8 ) )
		{
			eRule = RULE_SUFFIXES;
			continue;
		}

		if ( !strncasecmp ( szBuffer, "wordchars", 9 ) )
		{
			char * szStart = szBuffer + 9;
			while ( *szStart && isspace ( (unsigned char) *szStart ) )
				++szStart;

			char * szRangeL = szStart;
			while ( *szStart && !isspace ( (unsigned char) *szStart ) )
				++szStart;

			if ( !*szStart )
			{
				printf ( "WARNING: Line %d: invalid 'wordchars' statement\n", iLine );
				continue;
			}

			*szStart = '\0';
			++szStart;

			while ( *szStart && isspace ( (unsigned char) *szStart ) )
				++szStart;

			char * szRangeU = szStart;

			while ( *szStart && !isspace ( (unsigned char) *szStart ) )
				++szStart;

			*szStart = '\0';

			if ( !AddToCharset ( szRangeL, szRangeU ) )
				printf ( "WARNING: Line %d: cannot add to charset: '%s' '%s'\n", iLine, szRangeL, szRangeU );

			continue;
		}

		if ( !strncasecmp ( szBuffer, "flag", 4 ) )
		{
			if ( eRule==RULE_NONE )
			{
				printf ( "WARNING: Line %d: 'flag' appears before preffixes or suffixes\n", iLine );
				continue;
			}

			char * szStart = szBuffer + 4;
			while ( *szStart && isspace ( (unsigned char) *szStart ) )
				++szStart;

			bCrossProduct = ( *szStart=='*' );

			cFlag = bCrossProduct ? *(szStart + 1) : *(szStart);
			continue;
		}

		if ( eRule==RULE_NONE )
			continue;

		char * szComment = strchr ( szBuffer, '#' );
		if ( szComment )
			*szComment = '\0';

		if ( !* szBuffer )
			continue;

		szCondition[0] = '\0';
		szStrip[0] = '\0';
		szAppend[0] = '\0';

		int nFields = sscanf ( szBuffer, "%[^>\n]>%[^,\n],%[^\n]", szCondition, szStrip, szAppend ); // NOLINT

		Strip ( szCondition );
		Strip ( szStrip );
		Strip ( szAppend );

		switch ( nFields )
		{
		case 2: // no optional strip-string
			strcpy ( szAppend, szStrip ); // NOLINT
			szStrip[0] = '\0';
			break;
		case 3:	// all read
			break;
		default: // invalid repl
			continue;
		}

		CISpellAffixRule Rule ( eRule, cFlag, bCrossProduct, szCondition, szStrip, szAppend );
		m_dRules.Add ( Rule );
	}

	return true;
}
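The classic ispell .aff syntax parsed above writes each rule as 'condition > append' or 'condition > -strip,append'; a representative fragment (following the traditional english.aff layout, shown here for illustration):

suffixes
flag *T:
    E         > ST         # late > latest
    [^AEIOU]Y > -Y,IEST    # dirty > dirtiest
    [^EY]     > EST        # small > smallest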
Example #13
void ExcerptGen_c::TokenizeDocument ( char * pData, CSphDict * pDict, ISphTokenizer * pTokenizer, bool bFillMasks, bool bRetainHtml )
{
	m_iDocumentWords = 0;
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = pData;

	pTokenizer->SetBuffer ( (BYTE*)pData, strlen(pData) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	if ( bRetainHtml )
		pTokenizer->AddSpecials ( "<" );

	BYTE * sWord;
	DWORD uPosition = 0; // hit position in document
	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
	{
		if ( pTokenizer->TokenIsBlended() )
			continue;

		const char * pTokenStart = pTokenizer->GetTokenStart ();

		if ( pTokenStart!=pStartPtr && pTokenStart>pLastTokenEnd )
			AddJunk ( pLastTokenEnd - pStartPtr,
				pTokenStart - pLastTokenEnd,
				pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		if ( bRetainHtml && *pTokenStart=='<' )
		{
			int iTagEnd = FindTagEnd ( pTokenStart );
			if ( iTagEnd!=-1 )
			{
				assert ( pTokenStart+iTagEnd<pTokenizer->GetBufferEnd() );
				AddJunk ( pTokenStart-pStartPtr, iTagEnd+1, pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );
				pTokenizer->SetBufferPtr ( pTokenStart+iTagEnd+1 );
				pLastTokenEnd = pTokenStart+iTagEnd+1; // fix it up to prevent adding last chunk on exit
				continue;
			}
		}

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		if ( pTokenizer->GetBoundary() )
			uPosition += 100; // FIXME: this should be taken from index settings

		Token_t & tLast = m_dTokens.Add();
		tLast.m_eType = iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_uPosition = iWord ? ++uPosition : 0;
		tLast.m_iStart = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;
		if ( iWord )
			m_iDocumentWords++;

		m_iLastWord = iWord ? m_dTokens.GetLength() - 1 : m_iLastWord;

		// fill word mask
		if ( bFillMasks && iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * sKeyword = &m_dKeywordsBuffer [ m_dKeywords[nWord].m_iWord ];
				const Token_t & tToken = m_dWords[nWord];

				switch ( m_dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					bMatch = ( iWord==tToken.m_iWordID );
					break;

				case STAR_FRONT:
					iOffset = tLast.m_iLengthBytes - tToken.m_iLengthBytes;
					bMatch = ( iOffset>=0 ) &&
						( memcmp ( sKeyword, sWord + iOffset, tToken.m_iLengthBytes )==0 );
					break;

				case STAR_BACK:
					bMatch = ( tLast.m_iLengthBytes>=tToken.m_iLengthBytes ) &&
						( memcmp ( sKeyword, sWord, tToken.m_iLengthBytes )==0 );
					break;

				case STAR_BOTH:
					bMatch = strstr ( (const char *)sWord, sKeyword )!=NULL;
					break;
				}

				if ( bMatch )
				{
					tLast.m_uWords |= 1UL<<nWord;
					m_uFoundWords |= 1UL<<nWord;
				}
			}
		}
	}
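FindTagEnd() is not shown in this excerpt; from its use above it apparently returns the offset of the '>' closing the tag at pTokenStart, or -1 when the tag is unterminated. A minimal sketch under that assumption (the real implementation may also handle quoted attributes):

#include <cstring>

// hypothetical sketch: offset of the '>' closing the tag at sTag, or -1
static int FindTagEnd ( const char * sTag )
{
	const char * p = strchr ( sTag+1, '>' );
	return p ? (int)( p-sTag ) : -1;
}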
Example #14
bool sphPluginReload ( const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	// find all plugins from the given library
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	CSphVector<PluginKey_t> dKeys;
	CSphVector<PluginDesc_c*> dPlugins;

	g_hPlugins.IterateStart();
	while ( g_hPlugins.IterateNext() )
	{
		PluginDesc_c * v = g_hPlugins.IterateGet();
		if ( v->GetLibName()==sName )
		{
			dKeys.Add ( g_hPlugins.IterateGetKey() );
			dPlugins.Add ( g_hPlugins.IterateGet() );
		}
	}

	// no plugins loaded? oops
	if ( dPlugins.GetLength()==0 )
	{
		sError.SetSprintf ( "no active plugins loaded from %s", sName );
		return false;
	}

	// load new library and check every plugin
#if !USE_WINDOWS
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError, true );
#else
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError );
#endif
	if ( !pNewLib )
		return false;

	// load all plugins
	CSphVector<PluginDesc_c*> dNewPlugins;
	ARRAY_FOREACH ( i, dPlugins )
	{
		PluginDesc_c * pDesc = NULL;
		const SymbolDesc_t * pSym = NULL;
		switch ( dKeys[i].m_eType )
		{
			case PLUGIN_RANKER:					pDesc = new PluginRanker_c ( pNewLib ); pSym = g_dSymbolsRanker; break;
			case PLUGIN_INDEX_TOKEN_FILTER:		pDesc = new PluginTokenFilter_c ( pNewLib ); pSym = g_dSymbolsTokenFilter; break;
			case PLUGIN_QUERY_TOKEN_FILTER:		pDesc = new PluginQueryTokenFilter_c ( pNewLib ); pSym = g_dSymbolsQueryTokenFilter; break;
			case PLUGIN_FUNCTION:				pDesc = new PluginUDF_c ( pNewLib, dPlugins[i]->GetUdfRetType() ); pSym = g_dSymbolsUDF; break;
			default:
				sphDie ( "INTERNAL ERROR: unknown plugin type %d in sphPluginReload()", (int)dKeys[i].m_eType );
				return false;
		}

		if ( !PluginLoadSymbols ( pDesc, pSym, pNewLib->GetHandle(), dKeys[i].m_sName.cstr(), sError ) )
		{
			pDesc->Release();
			break;
		}

		dNewPlugins.Add ( pDesc );
	}
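A hypothetical call site for the reload (the library name is an assumption):

	CSphString sError;
	if ( !sphPluginReload ( "udfexample.so", sError ) )
		fprintf ( stdout, "reload failed: %s\n", sError.cstr() );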