/// Parses @a s with TryParse() and then applies additional validation.
///
/// The parsed value is accepted only if it reports IsValid(), its Lang()
/// part passes IsISOLanguage() and its Country() part is either empty or
/// passes IsISOCountry().
///
/// @param s  Text to parse.
/// @return   The parsed language when every check passes; otherwise a
///           default-constructed Language, which this function uses as
///           its "invalid" result.
Language Language::TryParseWithValidation(const std::wstring& s)
{
    Language parsed = Language::TryParse(s);

    // Short-circuit keeps the original order: validity first, and the
    // language-code lookup only runs for a value that parsed successfully.
    if (!parsed.IsValid() || !IsISOLanguage(parsed.Lang()))
    {
        return Language();
    }

    // The country part is optional; it is only checked when present.
    auto countryPart = parsed.Country();
    const bool countryAcceptable = countryPart.empty() || IsISOCountry(countryPart);
    if (!countryAcceptable)
    {
        return Language();
    }

    return parsed;
}
/// Looks up translation suggestions for @a source.
///
/// Three searches are attempted in order, and the first one that yields
/// any result wins:
///   1. the tokenized source as an exact phrase (scoreScaling 1.0),
///   2. the same phrase with a slop of 1 (scoreScaling 0.9),
///   3. a per-term boolean query (scoreScaling 0.8), whose hits are
///      additionally filtered by token-count difference and de-duplicated
///      by translation text, then stable-sorted.
///
/// @param srclang  Source language; its WCode() is turned into a term query
///                 on the "srclang" field.
/// @param lang     Target language; matched on the "lang" field by full code
///                 and, with a lower boost, by a related short/prefixed code.
/// @param source   Source text to search for.
/// @return         The suggestions found, or an empty list if nothing matched
///                 or a LuceneException was thrown along the way.
SuggestionsList TranslationMemoryImpl::Search(const Language& srclang, const Language& lang, const std::wstring& source)
{
    try
    {
        // NOTE(review): the original carried "TODO: query by srclang too!".
        // The query is built here and handed to the search helpers; whether
        // they actually restrict by it is not visible from this function.
        auto srclangQ = newLucene<TermQuery>(newLucene<Term>(L"srclang", srclang.WCode()));

        const Lucene::String fullLang = lang.WCode();
        const Lucene::String shortLang = StringUtils::toUnicode(lang.Lang());

        QueryPtr langPrimary = newLucene<TermQuery>(newLucene<Term>(L"lang", fullLang));
        QueryPtr langSecondary;
        if (fullLang != shortLang)
        {
            // A code with more than the bare language part: also accept
            // entries stored under the short language code.
            langSecondary = newLucene<TermQuery>(newLucene<Term>(L"lang", shortLang));
        }
        else
        {
            // A bare language code such as 'cs': also accept 'cs_*'
            // entries (e.g. 'cs_CZ').
            langSecondary = newLucene<PrefixQuery>(newLucene<Term>(L"lang", shortLang + L"_"));
        }
        // The secondary match is weighted below the exact-code match.
        langSecondary->setBoost(0.85);

        auto langQ = newLucene<BooleanQuery>();
        langQ->add(langPrimary, BooleanClause::SHOULD);
        langQ->add(langSecondary, BooleanClause::SHOULD);

        SuggestionsList results;

        const Lucene::String sourceField(L"source");
        auto boolQ = newLucene<BooleanQuery>();
        auto phraseQ = newLucene<PhraseQuery>();

        // Tokenize the source once, feeding every term into both the
        // phrase query (with its position) and the boolean query.
        auto stream = m_analyzer->tokenStream(sourceField, newLucene<StringReader>(source));
        int sourceTokensCount = 0;
        int sourceTokenPosition = -1;  // first increment moves this to the first real position
        while (stream->incrementToken())
        {
            ++sourceTokensCount;
            auto word = stream->getAttribute<TermAttribute>()->term();
            sourceTokenPosition += stream->getAttribute<PositionIncrementAttribute>()->getPositionIncrement();

            auto term = newLucene<Term>(sourceField, word);
            boolQ->add(newLucene<TermQuery>(term), BooleanClause::SHOULD);
            phraseQ->add(term, sourceTokenPosition);
        }

        auto searcher = m_mng->Searcher();

        // Stage 1: exact phrase.
        PerformSearch(searcher.ptr(), srclangQ, langQ, source, phraseQ, results,
                      QUALITY_THRESHOLD, /*scoreScaling=*/1.0);
        if (!results.empty())
        {
            return results;
        }

        // Stage 2: nothing found, so allow the phrase to be a bit sloppy.
        phraseQ->setSlop(1);
        PerformSearch(searcher.ptr(), srclangQ, langQ, source, phraseQ, results,
                      QUALITY_THRESHOLD, /*scoreScaling=*/0.9);
        if (!results.empty())
        {
            return results;
        }

        // Stage 3: last resort, plain terms search. This will almost
        // certainly give low-quality results, but hopefully better than
        // nothing.
        const auto minimumMatches =
            std::max(1, boolQ->getClauses().size() - MAX_ALLOWED_LENGTH_DIFFERENCE);
        boolQ->setMinimumNumberShouldMatch(minimumMatches);

        PerformSearchWithBlock
        (
            searcher.ptr(), srclangQ, langQ, source, boolQ,
            QUALITY_THRESHOLD, /*scoreScaling=*/0.8,
            [this, sourceField, sourceTokensCount, &results](DocumentPtr doc, double score)
            {
                auto candidateSource = get_text_field(doc, sourceField);
                auto candidateTrans = get_text_field(doc, L"trans");

                // Count the tokens of the stored source text with the same analyzer.
                auto candidateStream = m_analyzer->tokenStream(sourceField, newLucene<StringReader>(candidateSource));
                int candidateTokens = 0;
                while (candidateStream->incrementToken())
                {
                    ++candidateTokens;
                }

                // Reject hits whose length differs too much from the query...
                if (std::abs(candidateTokens - sourceTokensCount) > MAX_ALLOWED_LENGTH_DIFFERENCE)
                {
                    return;
                }
                // ...and hits whose translation is already in the list.
                if (ContainsResult(results, candidateTrans))
                {
                    return;
                }

                time_t created = DateField::stringToTime(doc->get(L"created"));
                Suggestion hit {candidateTrans, score, static_cast<int>(created)};
                hit.id = StringUtils::toUTF8(doc->get(L"uuid"));
                results.push_back(hit);
            }
        );

        std::stable_sort(results.begin(), results.end());
        return results;
    }
    catch (const LuceneException&)
    {
        // Any Lucene failure is reported to the caller as "no suggestions".
        return SuggestionsList();
    }
}
// Initializes m_lang from lang.Lang(). The Language argument is taken by
// value and only that one derived value is kept by this constructor.
// NOTE(review): Lang() is presumably the language code without the country
// part - confirm against the Language class.
CaseMismatch(Language lang) : m_lang(lang.Lang()) { }
// Initializes m_lang from lang.Lang(). The Language argument is taken by
// value and only that one derived value is kept by this constructor.
// NOTE(review): Lang() is presumably the language code without the country
// part - confirm against the Language class.
PunctuationMismatch(Language lang) : m_lang(lang.Lang()) { }