Example #1
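The ApplyMorphology routine reads all of stdin into a buffer, tokenizes it with a clone of the index's own tokenizer, runs the dictionary's stemmers over every token, and dumps the resulting space-separated stemmed tokens to stdout.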
void ApplyMorphology ( CSphIndex * pIndex )
{
	CSphVector<BYTE> dInBuffer, dOutBuffer;
	const int READ_BUFFER_SIZE = 1024;
	dInBuffer.Reserve ( READ_BUFFER_SIZE );
	char sBuffer[READ_BUFFER_SIZE];
	// slurp all of stdin into dInBuffer
	while ( !feof(stdin) )
	{
		int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
		if ( !iLen )
			break;

		int iPos = dInBuffer.GetLength();
		dInBuffer.Resize ( iPos+iLen );
		memcpy ( &dInBuffer[iPos], sBuffer, iLen );
	}
	dInBuffer.Add(0); // NUL-terminate the input for the tokenizer
	dOutBuffer.Reserve ( dInBuffer.GetLength() );

	// clone the index's tokenizer and fetch its dictionary for stemming
	CSphScopedPtr<ISphTokenizer> pTokenizer ( pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX ) );
	CSphDict * pDict = pIndex->GetDictionary();
	BYTE * sBufferToDump = &dInBuffer[0];
	if ( pTokenizer.Ptr() )
	{
		pTokenizer->SetBuffer ( &dInBuffer[0], dInBuffer.GetLength() );
		while ( BYTE * sToken = pTokenizer->GetToken() )
		{
			if ( pDict )
				pDict->ApplyStemmers ( sToken );

			int iPos = dOutBuffer.GetLength();
			int iLen = strlen ( (char *)sToken );
			sToken[iLen] = ' '; // overwrite the token's terminator with a space separator
			dOutBuffer.Resize ( iPos+iLen+1 );
			memcpy ( &dOutBuffer[iPos], sToken, iLen+1 );
		}

		// turn the trailing space into a terminator (or emit an empty string)
		if ( dOutBuffer.GetLength() )
			dOutBuffer[dOutBuffer.GetLength()-1] = 0;
		else
			dOutBuffer.Add(0);

		sBufferToDump = &dOutBuffer[0];
	}

	fprintf ( stdout, "dumping stemmed results...\n%s\n", sBufferToDump );
}
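A minimal calling sketch, assuming pIndex is a fully loaded CSphIndex obtained through the regular index loading path (creation and preallocation are not shown in the snippet); the helper below is hypothetical:

static void DumpStemmedStdin ( CSphIndex * pIndex )
{
	// pIndex is assumed to be created and preallocated elsewhere
	if ( pIndex )
		ApplyMorphology ( pIndex ); // consumes stdin, prints stemmed tokens to stdout
}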
Example #2
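sphConfTokenizer builds a tokenizer from an index configuration section: it selects an SBCS, UTF-8, or UTF-8 n-gram tokenizer based on charset_type, then applies the charset_table, min_word_len, ngram_chars, ngram_len, exceptions (or the deprecated synonyms), phrase_boundary, and ignore_chars settings, returning NULL and filling sError if any step fails.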
ISphTokenizer * sphConfTokenizer ( const CSphConfigSection & hIndex, CSphString & sError )
{
	// charset_type
	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );

	if ( !hIndex("charset_type") || hIndex["charset_type"]=="sbcs" )
	{
		pTokenizer = sphCreateSBCSTokenizer ();

	} else if ( hIndex["charset_type"]=="utf-8" )
	{
		pTokenizer = hIndex("ngram_chars")
			? sphCreateUTF8NgramTokenizer ()
			: sphCreateUTF8Tokenizer ();

	} else
	{
		sError.SetSprintf ( "unknown charset type '%s'", hIndex["charset_type"].cstr() );
		return NULL;
	}

	assert ( pTokenizer.Ptr() );

	// charset_table
	if ( hIndex("charset_table") )
		if ( !pTokenizer->SetCaseFolding ( hIndex["charset_table"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'charset_table': %s", sError.cstr() );
		return NULL;
	}

	// min_word_len
	int iMinWordLen = hIndex("min_word_len") ? Max ( hIndex["min_word_len"].intval(), 0 ) : 0;
	if ( iMinWordLen )
		pTokenizer->SetMinWordLen ( iMinWordLen );

	// ngram_chars
	if ( hIndex("ngram_chars") )
		if ( !pTokenizer->SetNgramChars ( hIndex["ngram_chars"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'ngram_chars': %s", sError.cstr() );
		return NULL;
	}

	// ngram_len
	int iNgramLen = hIndex("ngram_len") ? Max ( hIndex["ngram_len"].intval(), 0 ) : 0;
	if ( iNgramLen )
		pTokenizer->SetNgramLen ( iNgramLen );

	// synonyms
	CSphVariant * pExceptions = hIndex("exceptions"); // new option name
	if ( !pExceptions )
		pExceptions = hIndex("synonyms"); // deprecated option name

	if ( pExceptions && !pTokenizer->LoadSynonyms ( pExceptions->cstr(), sError ) )
	{
		sError.SetSprintf ( "'exceptions': %s", sError.cstr() );
		return NULL;
	}

	// phrase boundaries
	int iBoundaryStep = hIndex("phrase_boundary_step") ? Max ( hIndex["phrase_boundary_step"].intval(), 0 ) : 0;
	if ( iBoundaryStep>0 && hIndex("phrase_boundary") && !pTokenizer->SetBoundary ( hIndex["phrase_boundary"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'phrase_boundary': %s", sError.cstr() );
		return NULL;
	}

	// ignore_chars
	if ( hIndex("ignore_chars") )
		if ( !pTokenizer->SetIgnoreChars ( hIndex["ignore_chars"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'ignore_chars': %s", sError.cstr() );
		return NULL;
	}

	return pTokenizer.LeakPtr();
}
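A minimal usage sketch for the function above, assuming hIndex is the already-parsed configuration section of one index (obtained from the config parser elsewhere); the wrapper function below is hypothetical and only illustrates the call plus the error path:

static ISphTokenizer * SetupTokenizer ( const CSphConfigSection & hIndex )
{
	CSphString sError;
	ISphTokenizer * pTokenizer = sphConfTokenizer ( hIndex, sError );
	if ( !pTokenizer )
		fprintf ( stdout, "ERROR: %s.\n", sError.cstr() ); // report and return NULL
	return pTokenizer; // caller owns the returned tokenizer
}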