/// Read all of stdin, tokenize it with a clone of the index tokenizer, run each
/// token through the index dictionary's stemmers (when a dictionary is present),
/// and print the space-separated result to stdout.
///
/// When no tokenizer is available (the index has none, or cloning yields NULL),
/// the raw input is printed unchanged instead.
///
/// NOTE: the result is printed with "%s", so output stops at the first NUL byte
/// found in the data.
///
/// @param pIndex	index that supplies the tokenizer and the dictionary; must not be NULL
void ApplyMorphology ( CSphIndex * pIndex )
{
	CSphVector<BYTE> dInBuffer, dOutBuffer;
	const int READ_BUFFER_SIZE = 1024;
	dInBuffer.Reserve ( READ_BUFFER_SIZE );

	// slurp stdin in fixed-size chunks
	char sBuffer[READ_BUFFER_SIZE];
	while ( !feof(stdin) )
	{
		// fread() returns size_t; the value is bounded by sizeof(sBuffer), so the cast is safe
		int iLen = (int) fread ( sBuffer, 1, sizeof(sBuffer), stdin );
		if ( !iLen )
			break;

		int iPos = dInBuffer.GetLength();
		dInBuffer.Resize ( iPos+iLen );
		memcpy ( &dInBuffer[iPos], sBuffer, iLen );
	}

	// terminate the input so that it can be printed as a C string
	dInBuffer.Add(0);
	dOutBuffer.Reserve ( dInBuffer.GetLength() );

	// only clone the tokenizer when the index actually has one;
	// otherwise fall through to the raw dump below instead of dereferencing NULL
	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );
	if ( pIndex->GetTokenizer() )
		pTokenizer = pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX );

	CSphDict * pDict = pIndex->GetDictionary();
	BYTE * sBufferToDump = &dInBuffer[0];

	if ( pTokenizer.Ptr() )
	{
		pTokenizer->SetBuffer ( &dInBuffer[0], dInBuffer.GetLength() );
		while ( BYTE * sToken = pTokenizer->GetToken() )
		{
			if ( pDict )
				pDict->ApplyStemmers ( sToken );

			int iPos = dOutBuffer.GetLength();
			int iLen = strlen ( (char *)sToken );

			// append the token followed by a separator; the separator is written
			// into our own buffer rather than over the terminator of sToken,
			// which points into storage returned by GetToken()
			dOutBuffer.Resize ( iPos+iLen+1 );
			memcpy ( &dOutBuffer[iPos], sToken, iLen );
			dOutBuffer[iPos+iLen] = ' ';
		}

		// turn the trailing separator into the terminator (or emit an empty string)
		if ( dOutBuffer.GetLength() )
			dOutBuffer[dOutBuffer.GetLength()-1] = 0;
		else
			dOutBuffer.Add(0);

		sBufferToDump = &dOutBuffer[0];
	}

	fprintf ( stdout, "dumping stemmed results...\n%s\n", sBufferToDump );
}
/// Create a tokenizer and configure it from an index config section.
///
/// Options read: charset_type, charset_table, min_word_len, ngram_chars,
/// ngram_len, exceptions (or its deprecated alias synonyms),
/// phrase_boundary_step, phrase_boundary, ignore_chars.
///
/// @param hIndex	index config section to read the options from
/// @param sError	receives the error message, prefixed with the option name, on failure
/// @return		the configured tokenizer (released from the local scoped pointer via LeakPtr),
///			or NULL when an option is invalid
ISphTokenizer * sphConfTokenizer ( const CSphConfigSection & hIndex, CSphString & sError )
{
	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );

	// charset_type: a missing option is treated the same as "sbcs"
	if ( hIndex("charset_type") && !( hIndex["charset_type"]=="sbcs" ) )
	{
		if ( hIndex["charset_type"]=="utf-8" )
		{
			// the presence of ngram_chars selects the n-gram flavor
			if ( hIndex("ngram_chars") )
				pTokenizer = sphCreateUTF8NgramTokenizer ();
			else
				pTokenizer = sphCreateUTF8Tokenizer ();
		} else
		{
			sError.SetSprintf ( "unknown charset type '%s'", hIndex["charset_type"].cstr() );
			return NULL;
		}
	} else
	{
		pTokenizer = sphCreateSBCSTokenizer ();
	}
	assert ( pTokenizer.Ptr() );

	// charset_table
	if ( hIndex("charset_table") && !pTokenizer->SetCaseFolding ( hIndex["charset_table"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'charset_table': %s", sError.cstr() );
		return NULL;
	}

	// min_word_len: negative values are clamped to 0, and 0 leaves the tokenizer untouched
	if ( hIndex("min_word_len") )
	{
		const int iWordLen = Max ( hIndex["min_word_len"].intval(), 0 );
		if ( iWordLen )
			pTokenizer->SetMinWordLen ( iWordLen );
	}

	// ngram_chars
	if ( hIndex("ngram_chars") && !pTokenizer->SetNgramChars ( hIndex["ngram_chars"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'ngram_chars': %s", sError.cstr() );
		return NULL;
	}

	// ngram_len: negative values are clamped to 0, and 0 leaves the tokenizer untouched
	if ( hIndex("ngram_len") )
	{
		const int iGramLen = Max ( hIndex["ngram_len"].intval(), 0 );
		if ( iGramLen )
			pTokenizer->SetNgramLen ( iGramLen );
	}

	// exceptions; the new option name wins over the deprecated "synonyms"
	CSphVariant * pSynonymsFile = hIndex("exceptions");
	if ( !pSynonymsFile )
		pSynonymsFile = hIndex("synonyms");

	if ( pSynonymsFile && !pTokenizer->LoadSynonyms ( pSynonymsFile->cstr(), sError ) )
	{
		sError.SetSprintf ( "'exceptions': %s", sError.cstr() );
		return NULL;
	}

	// phrase boundaries; only applied when a positive step is configured
	int iStep = 0;
	if ( hIndex("phrase_boundary_step") )
		iStep = Max ( hIndex["phrase_boundary_step"].intval(), 0 );

	if ( iStep>0 && hIndex("phrase_boundary") && !pTokenizer->SetBoundary ( hIndex["phrase_boundary"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'phrase_boundary': %s", sError.cstr() );
		return NULL;
	}

	// ignore_chars
	if ( hIndex("ignore_chars") && !pTokenizer->SetIgnoreChars ( hIndex["ignore_chars"].cstr(), sError ) )
	{
		sError.SetSprintf ( "'ignore_chars': %s", sError.cstr() );
		return NULL;
	}

	// all options applied; release the tokenizer from the scoped pointer and return it
	return pTokenizer.LeakPtr();
}