/**
 * Chooses the language to use for 'userDoc' under text index version 1.
 *
 * When the document's language-override field holds a non-empty string, that string is
 * resolved through FTSLanguage::make() with TEXT_INDEX_VERSION_1. In every other case
 * (field is not a string, or the string is empty) the spec's default language is returned.
 */
const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const {
    BSONElement overrideElt = userDoc[_languageOverrideField];
    if (overrideElt.type() != String) {
        return *_defaultLanguage;
    }

    const char* requestedName = overrideElt.valuestrsafe();
    if (requestedName[0] == '\0') {
        // An empty override is treated the same as no override at all.
        return *_defaultLanguage;
    }

    StatusWithFTSLanguage swl = FTSLanguage::make(requestedName, TEXT_INDEX_VERSION_1);
    dassert(swl.isOK());  // make() w/ TEXT_INDEX_VERSION_1 guaranteed to not fail.
    return *swl.getValue();
}
std::vector<std::string> tokenizeString(const char* str, const char* language) { StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2); ASSERT_OK(swl); std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer()); tokenizer->reset(str, FTSTokenizer::None); std::vector<std::string> terms; while (tokenizer->moveNext()) { terms.push_back(tokenizer->get().toString()); } return terms; }
std::vector<std::string> tokenizeString(const char* str, const char* language, FTSTokenizer::Options options) { StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3); ASSERT_OK(swl); UnicodeFTSTokenizer tokenizer(swl.getValue()); tokenizer.reset(str, options); std::vector<std::string> terms; while (tokenizer.moveNext()) { terms.push_back(tokenizer.get().toString()); } return terms; }
// Version 2: an upper-case language name must resolve, and the resulting language
// reports its name as "spanish".
TEST(FTSLanguageV2, UpperCaseLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// Version 2: the code "es" must resolve to the language whose name is "spanish".
TEST(FTSLanguageV2, ExactCode) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("es", TEXT_INDEX_VERSION_2);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// Version 3: an unrecognized language name must produce a non-OK status.
TEST(FTSLanguageV3, Unknown) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3);

    ASSERT(!lookup.isOK());
}
// Version 3: the empty string is not accepted as a language name.
TEST(FTSLanguageV3, Empty) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("", TEXT_INDEX_VERSION_3);

    ASSERT(!lookup.isOK());
}
// Version 3: "none" is a valid language name and is reported back as "none".
TEST(FTSLanguageV3, NoneLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("none", TEXT_INDEX_VERSION_3);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// Version 3: the upper-case code "ES" must resolve to the language named "spanish".
TEST(FTSLanguageV3, UpperCaseCode) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// Version 1: an unrecognized name does not fail; the lookup succeeds and yields the
// language named "none".
TEST(FTSLanguageV1, Unknown) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// Version 1: "SPANISH" in upper case is not matched to "spanish"; the lookup still
// succeeds but yields the language named "none".
TEST(FTSLanguageV1, CaseSensitive) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// Version 1: the name "eng" is accepted and reported back unchanged as "eng".
TEST(FTSLanguageV1, StemmerOnlyLanguage2) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "eng");
}
// Version 1: the name "porter" is accepted and reported back unchanged as "porter".
TEST(FTSLanguageV1, DeprecatedLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "porter");
}
/**
 * Parses a text-search query string into terms and phrases.
 *
 * Stores 'query' in _search, resolves 'language' for 'textIndexVersion' (returning the
 * lookup's status unchanged if that fails), then walks the query's tokens:
 *   - TEXT tokens are handed to _addTerm() together with the current negation flag,
 *     except when inside a negated phrase, where they are skipped.
 *   - A '-' delimiter preceded by whitespace and outside a phrase turns negation on.
 *   - A '"' delimiter opens or closes a phrase; on close, the lower-cased text between
 *     the quotes is appended to _negatedPhrases or _phrases.
 * Any token that is neither TEXT nor DELIMITER aborts the process.
 *
 * Returns Status::OK() once all tokens have been consumed.
 */
Status FTSQuery::parse(const string& query,
                       StringData language,
                       TextIndexVersion textIndexVersion) {
    _search = query;

    StatusWithFTSLanguage swl = FTSLanguage::make(language, textIndexVersion);
    if (!swl.getStatus().isOK()) {
        return swl.getStatus();
    }
    _language = swl.getValue();

    const StopWords* stopWords = StopWords::getStopWords(*_language);
    Stemmer stemmer(*_language);

    bool negating = false;           // a '-' is currently in effect
    bool insidePhrase = false;       // between an opening and a closing '"'
    unsigned openQuoteOffset = 0;    // offset of the most recent opening '"'

    Tokenizer tokens(*_language, query);
    while (tokens.more()) {
        Token tok = tokens.next();

        if (tok.type == Token::TEXT) {
            // Words inside a negated phrase are not added as individual terms.
            const bool skipTerm = insidePhrase && negating;
            if (!skipTerm) {
                string term = tok.data.toString();
                _addTerm(stopWords, stemmer, term, negating);
            }

            // Outside a phrase, a negation applies to a single term only.
            if (!insidePhrase && negating) {
                negating = false;
            }
            continue;
        }

        if (tok.type != Token::DELIMITER) {
            abort();
        }

        switch (tok.data[0]) {
            case '-':
                // phrases can be negated, and terms not in phrases can be negated.
                // terms in phrases can not be negated.
                if (tok.previousWhiteSpace && !insidePhrase) {
                    negating = true;
                }
                break;

            case '"':
                if (!insidePhrase) {
                    // Opening quote: remember where the phrase text will start.
                    insidePhrase = true;
                    openQuoteOffset = tok.offset;
                } else {
                    // Closing quote: extract the text strictly between the two quotes.
                    unsigned firstChar = openQuoteOffset + 1;
                    unsigned numChars = tok.offset - firstChar;
                    StringData phraseText = StringData(query).substr(firstChar, numChars);

                    if (negating) {
                        _negatedPhrases.push_back(tolowerString(phraseText));
                    } else {
                        _phrases.push_back(tolowerString(phraseText));
                    }

                    negating = false;
                    insidePhrase = false;
                }
                break;

            default:
                // Other delimiters carry no meaning for query parsing.
                break;
        }
    }

    return Status::OK();
}
/**
 * Parses the stored query string into positive/negated terms and phrases.
 *
 * Resolves getLanguage() for 'textIndexVersion' (returning the lookup's status unchanged
 * if that fails), then walks the tokens of getQuery():
 *   - TEXT tokens are appended, space-separated, to either a positive or a negative
 *     "sentence"; words inside a negated phrase are skipped.
 *   - A '-' delimiter preceded by whitespace and outside a phrase turns negation on.
 *   - A '"' delimiter opens or closes a phrase; on close, the text between the quotes is
 *     appended to _negatedPhrases or _positivePhrases.
 * After the walk, both sentences are passed to _addTerms() with a tokenizer created from
 * the resolved language.
 *
 * Returns Status::OK() once all tokens have been consumed.
 */
Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) {
    StatusWithFTSLanguage ftsLanguage = FTSLanguage::make(getLanguage(), textIndexVersion);
    if (!ftsLanguage.getStatus().isOK()) {
        return ftsLanguage.getStatus();
    }

    // Space-delimited word lists that the FTSTokenizer will tokenize afterwards.
    string positiveTermSentence;
    string negativeTermSentence;

    bool negating = false;           // a '-' is currently in effect
    bool insidePhrase = false;       // between an opening and a closing '"'
    unsigned openQuoteOffset = 0;    // offset of the most recent opening '"'

    FTSQueryParser queryTokens(getQuery());
    while (queryTokens.more()) {
        QueryToken tok = queryTokens.next();

        if (tok.type == QueryToken::TEXT) {
            // Words inside a negated phrase are not added as individual terms.
            if (insidePhrase && negating) {
                continue;
            }

            // A negation should only continue until the next whitespace character. For
            // example, "-foo" should negate "foo", "- foo" should not negate "foo", and
            // "-foo-bar" should negate both "foo" and "bar".
            if (tok.previousWhiteSpace && negating) {
                negating = false;
            }

            string& sentence = negating ? negativeTermSentence : positiveTermSentence;
            sentence.append(tok.data.toString());
            sentence.push_back(' ');
            continue;
        }

        if (tok.type != QueryToken::DELIMITER) {
            MONGO_UNREACHABLE;
        }

        switch (tok.data[0]) {
            case '-':
                // phrases can be negated, and terms not in phrases can be negated.
                // terms in phrases can not be negated.
                if (tok.previousWhiteSpace && !insidePhrase) {
                    negating = true;
                }
                break;

            case '"':
                if (!insidePhrase) {
                    // Opening quote.
                    insidePhrase = true;

                    // A "-" should only be treated as a negation if there is no
                    // whitespace between the "-" and the start of the phrase.
                    if (tok.previousWhiteSpace && negating) {
                        negating = false;
                    }
                    openQuoteOffset = tok.offset;
                } else {
                    // Closing quote: extract the text strictly between the two quotes.
                    unsigned firstChar = openQuoteOffset + 1;
                    unsigned numChars = tok.offset - firstChar;
                    StringData phraseText = StringData(getQuery()).substr(firstChar, numChars);

                    if (negating) {
                        _negatedPhrases.push_back(phraseText.toString());
                    } else {
                        _positivePhrases.push_back(phraseText.toString());
                    }

                    // 'negating' is deliberately left untouched: a negation continues
                    // until the next whitespace character. For example,
                    // '-"foo bar"-"baz quux"' should negate both the phrase "foo bar"
                    // and the phrase "baz quux".
                    insidePhrase = false;
                }
                break;

            default:
                // Other delimiters carry no meaning for query parsing.
                break;
        }
    }

    std::unique_ptr<FTSTokenizer> tokenizer(ftsLanguage.getValue()->createTokenizer());
    _addTerms(tokenizer.get(), positiveTermSentence, false);
    _addTerms(tokenizer.get(), negativeTermSentence, true);

    return Status::OK();
}