Ejemplo n.º 1
0
Language Language::TryParseWithValidation(const std::wstring& s)
{
    Language lang = Language::TryParse(s);
    if (!lang.IsValid())
        return Language(); // invalid

    if (!IsISOLanguage(lang.Lang()))
        return Language(); // invalid

    auto country = lang.Country();
    if (!country.empty() && !IsISOCountry(country))
        return Language(); // invalid

    return lang;
}
Ejemplo n.º 2
0
SuggestionsList TranslationMemoryImpl::Search(const Language& srclang,
                                              const Language& lang,
                                              const std::wstring& source)
{
    try
    {
        // TODO: query by srclang too!
        auto srclangQ = newLucene<TermQuery>(newLucene<Term>(L"srclang", srclang.WCode()));

        const Lucene::String fullLang = lang.WCode();
        const Lucene::String shortLang = StringUtils::toUnicode(lang.Lang());

        QueryPtr langPrimary = newLucene<TermQuery>(newLucene<Term>(L"lang", fullLang));
        QueryPtr langSecondary;
        if (fullLang == shortLang)
        {
            // for e.g. 'cs', search also 'cs_*' (e.g. 'cs_CZ')
            langSecondary = newLucene<PrefixQuery>(newLucene<Term>(L"lang", shortLang + L"_"));
        }
        else
        {
            // search short variants of the language too
            langSecondary = newLucene<TermQuery>(newLucene<Term>(L"lang", shortLang));
        }
        langSecondary->setBoost(0.85);
        auto langQ = newLucene<BooleanQuery>();
        langQ->add(langPrimary, BooleanClause::SHOULD);
        langQ->add(langSecondary, BooleanClause::SHOULD);

        SuggestionsList results;

        const Lucene::String sourceField(L"source");
        auto boolQ = newLucene<BooleanQuery>();
        auto phraseQ = newLucene<PhraseQuery>();

        auto stream = m_analyzer->tokenStream(sourceField, newLucene<StringReader>(source));
        int sourceTokensCount = 0;
        int sourceTokenPosition = -1;
        while (stream->incrementToken())
        {
            sourceTokensCount++;
            auto word = stream->getAttribute<TermAttribute>()->term();
            sourceTokenPosition += stream->getAttribute<PositionIncrementAttribute>()->getPositionIncrement();
            auto term = newLucene<Term>(sourceField, word);
            boolQ->add(newLucene<TermQuery>(term), BooleanClause::SHOULD);
            phraseQ->add(term, sourceTokenPosition);
        }

        auto searcher = m_mng->Searcher();

        // Try exact phrase first:
        PerformSearch(searcher.ptr(), srclangQ, langQ, source, phraseQ, results,
                      QUALITY_THRESHOLD, /*scoreScaling=*/1.0);
        if (!results.empty())
            return results;

        // Then, if no matches were found, permit being a bit sloppy:
        phraseQ->setSlop(1);
        PerformSearch(searcher.ptr(), srclangQ, langQ, source, phraseQ, results,
                      QUALITY_THRESHOLD, /*scoreScaling=*/0.9);

        if (!results.empty())
            return results;

        // As the last resort, try terms search. This will almost certainly
        // produce low-quality results, but hopefully better than nothing.
        boolQ->setMinimumNumberShouldMatch(std::max(1, boolQ->getClauses().size() - MAX_ALLOWED_LENGTH_DIFFERENCE));
        PerformSearchWithBlock
        (
            searcher.ptr(), srclangQ, langQ, source, boolQ,
            QUALITY_THRESHOLD, /*scoreScaling=*/0.8,
            [=,&results](DocumentPtr doc, double score)
            {
                auto s = get_text_field(doc, sourceField);
                auto t = get_text_field(doc, L"trans");
                auto stream2 = m_analyzer->tokenStream(sourceField, newLucene<StringReader>(s));
                int tokensCount2 = 0;
                while (stream2->incrementToken())
                    tokensCount2++;

                if (std::abs(tokensCount2 - sourceTokensCount) <= MAX_ALLOWED_LENGTH_DIFFERENCE &&
                    !ContainsResult(results, t))
                {
                    time_t ts = DateField::stringToTime(doc->get(L"created"));
                    Suggestion r {t, score, int(ts)};
                    r.id = StringUtils::toUTF8(doc->get(L"uuid"));
                    results.push_back(r);
                }
            }
        );

        std::stable_sort(results.begin(), results.end());
        return results;
    }
    catch (LuceneException&)
    {
        return SuggestionsList();
    }
}
Ejemplo n.º 3
0
 CaseMismatch(Language lang) : m_lang(lang.Lang())
 {
 }
Ejemplo n.º 4
0
 PunctuationMismatch(Language lang) : m_lang(lang.Lang())
 {
 }