/**
 * Picks the language to use for 'userDoc' under TEXT_INDEX_VERSION_1.
 *
 * If the document's language-override field is a non-empty string, that string is handed to
 * FTSLanguage::make() and the resulting language is returned. In every other case (field is
 * not a string, or is the empty string) the spec's default language is returned.
 */
const FTSLanguage& FTSSpec::_getLanguageToUseV1(const BSONObj& userDoc) const {
    BSONElement overrideElt = userDoc[_languageOverrideField];
    if (overrideElt.type() != String) {
        return *_defaultLanguage;
    }

    const char* requested = overrideElt.valuestrsafe();
    if (*requested == '\0') {
        // An empty override string means "no override".
        return *_defaultLanguage;
    }

    StatusWithFTSLanguage made = FTSLanguage::make(requested, TEXT_INDEX_VERSION_1);
    // make() w/ TEXT_INDEX_VERSION_1 is guaranteed to not fail.
    dassert(made.isOK());
    return *made.getValue();
}
    std::vector<std::string> tokenizeString(const char* str, const char* language) {
        StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_2);
        ASSERT_OK(swl);

        std::unique_ptr<FTSTokenizer> tokenizer(swl.getValue()->createTokenizer());

        tokenizer->reset(str, FTSTokenizer::None);

        std::vector<std::string> terms;

        while (tokenizer->moveNext()) {
            terms.push_back(tokenizer->get().toString());
        }

        return terms;
    }
std::vector<std::string> tokenizeString(const char* str,
                                        const char* language,
                                        FTSTokenizer::Options options) {
    StatusWithFTSLanguage swl = FTSLanguage::make(language, TEXT_INDEX_VERSION_3);
    ASSERT_OK(swl);

    UnicodeFTSTokenizer tokenizer(swl.getValue());

    tokenizer.reset(str, options);

    std::vector<std::string> terms;

    while (tokenizer.moveNext()) {
        terms.push_back(tokenizer.get().toString());
    }

    return terms;
}
// With TEXT_INDEX_VERSION_2, the upper-case name "SPANISH" must resolve successfully to the
// language whose str() is "spanish".
TEST(FTSLanguageV2, UpperCaseLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_2);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// With TEXT_INDEX_VERSION_2, the short code "es" must resolve successfully to the language
// whose str() is "spanish".
TEST(FTSLanguageV2, ExactCode) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("es", TEXT_INDEX_VERSION_2);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// With TEXT_INDEX_VERSION_3, an unrecognized language name ("spanglish") must be rejected
// with a non-OK status.
TEST(FTSLanguageV3, Unknown) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("spanglish", TEXT_INDEX_VERSION_3);

    ASSERT(!lookup.getStatus().isOK());
}
// With TEXT_INDEX_VERSION_3, the empty string must be rejected with a non-OK status.
TEST(FTSLanguageV3, Empty) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("", TEXT_INDEX_VERSION_3);

    ASSERT(!lookup.getStatus().isOK());
}
// With TEXT_INDEX_VERSION_3, "none" must be accepted and reported back as "none".
TEST(FTSLanguageV3, NoneLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("none", TEXT_INDEX_VERSION_3);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// With TEXT_INDEX_VERSION_3, the upper-case short code "ES" must resolve successfully to the
// language whose str() is "spanish".
TEST(FTSLanguageV3, UpperCaseCode) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("ES", TEXT_INDEX_VERSION_3);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "spanish");
}
// With TEXT_INDEX_VERSION_1, an unrecognized name ("asdf") is not an error: the lookup
// succeeds and the returned language reports "none".
TEST(FTSLanguageV1, Unknown) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("asdf", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// With TEXT_INDEX_VERSION_1, the upper-case name "SPANISH" does not match "spanish": the
// lookup still succeeds, but the returned language reports "none".
TEST(FTSLanguageV1, CaseSensitive) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("SPANISH", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "none");
}
// With TEXT_INDEX_VERSION_1, "eng" must be accepted and reported back verbatim as "eng".
TEST(FTSLanguageV1, StemmerOnlyLanguage2) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("eng", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "eng");
}
// With TEXT_INDEX_VERSION_1, "porter" must be accepted and reported back verbatim as "porter".
TEST(FTSLanguageV1, DeprecatedLanguage) {
    StatusWithFTSLanguage lookup = FTSLanguage::make("porter", TEXT_INDEX_VERSION_1);

    ASSERT(lookup.getStatus().isOK());
    ASSERT_EQUALS(lookup.getValue()->str(), "porter");
}
Example #14
0
        /**
         * Parses the text-search string 'query' for the given 'language' and index version.
         *
         * Stores the raw query in '_search' and the resolved language in '_language', then
         * walks the token stream: TEXT tokens are passed to _addTerm() (except inside a
         * negated phrase), '-' delimiters may start a negation, and '"' delimiters open and
         * close phrases, which are lower-cased and stored in '_phrases' or '_negatedPhrases'.
         *
         * Returns the status from FTSLanguage::make() if the language cannot be resolved,
         * otherwise Status::OK(). Calls abort() on a token that is neither TEXT nor DELIMITER.
         */
        Status FTSQuery::parse(const string& query, StringData language,
                               TextIndexVersion textIndexVersion) {
            _search = query;

            StatusWithFTSLanguage languageResult = FTSLanguage::make( language,
                                                                      textIndexVersion );
            if ( !languageResult.getStatus().isOK() ) {
                return languageResult.getStatus();
            }
            _language = languageResult.getValue();

            const StopWords* stopWords = StopWords::getStopWords( *_language );
            Stemmer stemmer( *_language );

            // Parser state carried across tokens.
            bool negating = false;
            bool insidePhrase = false;
            unsigned openQuoteOffset = 0;

            Tokenizer tokens( *_language, query );
            while ( tokens.more() ) {
                Token tok = tokens.next();

                if ( tok.type == Token::TEXT ) {
                    string term = tok.data.toString();

                    // Terms inside a negated phrase are not added individually.
                    if ( !( insidePhrase && negating ) ) {
                        _addTerm( stopWords, stemmer, term, negating );
                    }

                    // Outside a phrase, a negation applies to a single term only.
                    if ( negating && !insidePhrase ) {
                        negating = false;
                    }
                    continue;
                }

                if ( tok.type != Token::DELIMITER ) {
                    abort();
                }

                const char delimiter = tok.data[0];

                if ( delimiter == '-' ) {
                    // phrases can be negated, and terms not in phrases can be negated.
                    // terms in phrases can not be negated.
                    if ( !insidePhrase && tok.previousWhiteSpace ) {
                        negating = true;
                    }
                    continue;
                }

                if ( delimiter != '"' ) {
                    // Any other delimiter carries no meaning for the parser.
                    continue;
                }

                if ( !insidePhrase ) {
                    // start of a phrase: remember where the opening quote sits
                    insidePhrase = true;
                    openQuoteOffset = tok.offset;
                    continue;
                }

                // end of a phrase: slice out the text between the two quotes
                unsigned phraseBegin = openQuoteOffset + 1;
                unsigned phraseSize = tok.offset - phraseBegin;
                StringData phraseText = StringData( query ).substr( phraseBegin, phraseSize );
                if ( negating ) {
                    _negatedPhrases.push_back( tolowerString( phraseText ) );
                }
                else {
                    _phrases.push_back( tolowerString( phraseText ) );
                }
                negating = false;
                insidePhrase = false;
            }

            return Status::OK();
        }
Example #15
0
/**
 * Parses the query string returned by getQuery() using the language returned by getLanguage()
 * and the given 'textIndexVersion'.
 *
 * TEXT tokens are gathered into two space-delimited sentences (positive and negated terms),
 * which are handed to _addTerms() with the language's tokenizer once the whole query has been
 * scanned. Quoted phrases are stored verbatim in '_positivePhrases' or '_negatedPhrases'.
 *
 * Returns the status from FTSLanguage::make() if the language cannot be resolved, otherwise
 * Status::OK().
 */
Status FTSQueryImpl::parse(TextIndexVersion textIndexVersion) {
    StatusWithFTSLanguage languageResult = FTSLanguage::make(getLanguage(), textIndexVersion);
    if (!languageResult.getStatus().isOK()) {
        return languageResult.getStatus();
    }

    // Space-delimited word lists that the FtsTokenizer tokenizes after the scan.
    string positiveTermSentence;
    string negativeTermSentence;

    // Parser state carried across tokens.
    bool negating = false;
    bool insidePhrase = false;
    unsigned openQuoteOffset = 0;

    FTSQueryParser queryTokens(getQuery());
    while (queryTokens.more()) {
        QueryToken tok = queryTokens.next();

        if (tok.type == QueryToken::TEXT) {
            string word = tok.data.toString();

            // Terms inside a negated phrase are not added individually.
            if (insidePhrase && negating) {
                continue;
            }

            // A negation should only continue until the next whitespace character. For example,
            // "-foo" should negate "foo", "- foo" should not negate "foo", and "-foo-bar"
            // should negate both "foo" and "bar".
            if (negating && tok.previousWhiteSpace) {
                negating = false;
            }

            string& sentence = negating ? negativeTermSentence : positiveTermSentence;
            sentence.append(word);
            sentence.push_back(' ');
            continue;
        }

        if (tok.type != QueryToken::DELIMITER) {
            MONGO_UNREACHABLE;
        }

        const char delimiter = tok.data[0];

        if (delimiter == '-') {
            // phrases can be negated, and terms not in phrases can be negated.
            // terms in phrases can not be negated.
            if (!insidePhrase && tok.previousWhiteSpace) {
                negating = true;
            }
            continue;
        }

        if (delimiter != '"') {
            // Any other delimiter carries no meaning for the parser.
            continue;
        }

        if (!insidePhrase) {
            // start of a phrase
            insidePhrase = true;
            // A "-" should only be treated as a negation if there is no whitespace between
            // the "-" and the start of the phrase.
            if (negating && tok.previousWhiteSpace) {
                negating = false;
            }
            openQuoteOffset = tok.offset;
            continue;
        }

        // end of a phrase: slice out the text between the two quotes
        unsigned phraseBegin = openQuoteOffset + 1;
        unsigned phraseSize = tok.offset - phraseBegin;
        StringData phraseText = StringData(getQuery()).substr(phraseBegin, phraseSize);
        if (negating) {
            _negatedPhrases.push_back(phraseText.toString());
        } else {
            _positivePhrases.push_back(phraseText.toString());
        }

        // Do not reset 'negating' here, since a negation should continue until the next
        // whitespace character. For example, '-"foo bar"-"baz quux"' should negate both the
        // phrase "foo bar" and the phrase "baz quux".
        insidePhrase = false;
    }

    std::unique_ptr<FTSTokenizer> termTokenizer(languageResult.getValue()->createTokenizer());

    _addTerms(termTokenizer.get(), positiveTermSentence, false);
    _addTerms(termTokenizer.get(), negativeTermSentence, true);

    return Status::OK();
}