/**
 * Creates a new dicNode that represents a space insertion at the end of the input dicNode. Also
 * incorporates the unigram / bigram score for the ending word into the new dicNode.
 */
void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode,
        const bool spaceSubstitution) const {
    const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
            ->getWordAttributesInContext(dicNode->getPrevWordIds(), dicNode->getWordId(),
                    traverseSession->getMultiBigramMap());
    if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
            dicNode, wordAttributes, false /* isLastWord */)) {
        return;
    }

    if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
        return;
    }

    // Create a non-cached node here.
    DicNode newDicNode;
    DicNodeUtils::initAsRootWithPreviousWord(
            traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode);
    const CorrectionType correctionType = spaceSubstitution ?
            CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMISSION;
    Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession,
            dicNode, &newDicNode, traverseSession->getMultiBigramMap());
    if (newDicNode.getCompoundDistance() < static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
        // newDicNode is worth continuing to traverse.
        // CAVEAT: This pruning is important for speed. Remove this when we can afford not to
        // prune here because here is not the right place to do pruning. Pruning should take
        // place only in DicNodePriorityQueue.
        traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode);
    }
}
// Returns the probability of wordId in the context of prevWordIds, or NOT_A_PROBABILITY when
// any id is invalid or the entry is blacklisted or marked not-a-word.
int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
        const int wordId) const {
    if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
        return NOT_A_PROBABILITY;
    }
    const WordAttributes wordAttributes =
            mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
                    true /* mustMatchAllPrevWords */, mHeaderPolicy);
    if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) {
        return NOT_A_PROBABILITY;
    }
    return wordAttributes.getProbability();
}
// Returns the maximum probability among dictionary words that exactly match the given code
// points (allowing case errors, accent errors, and digraphs), or NOT_A_PROBABILITY if there
// is no such word.
/* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches(
        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
        const CodePointArrayView codePoints) {
    std::vector<DicNode> current;
    std::vector<DicNode> next;

    // No ngram context.
    NgramContext emptyNgramContext;
    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = emptyNgramContext.getPrevWordIds(
            dictionaryStructurePolicy, &prevWordIdArray, false /* tryLowerCaseSearch */);
    current.emplace_back();
    DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordIds, &current.front());
    for (const int codePoint : codePoints) {
        // The base-lower input is used to ignore case errors and accent errors.
        const int baseLowerCodePoint = CharUtils::toBaseLowerCase(codePoint);
        for (const DicNode &dicNode : current) {
            if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == baseLowerCodePoint) {
                next.emplace_back(dicNode);
                next.back().advanceDigraphIndex();
                continue;
            }
            processChildDicNodes(dictionaryStructurePolicy, baseLowerCodePoint, &dicNode,
                    &next);
        }
        current.clear();
        current.swap(next);
    }
    int maxProbability = NOT_A_PROBABILITY;
    for (const DicNode &dicNode : current) {
        if (!dicNode.isTerminalDicNode()) {
            continue;
        }
        const WordAttributes wordAttributes =
                dictionaryStructurePolicy->getWordAttributesInContext(dicNode.getPrevWordIds(),
                        dicNode.getWordId(), nullptr /* multiBigramMap */);
        // dicNode can contain case errors, accent errors, intentional omissions or digraphs.
        maxProbability = std::max(maxProbability, wordAttributes.getProbability());
    }
    return maxProbability;
}
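// The loop above advances a frontier of DicNodes one input code point at a time: "next" is
// grown from "current", then the two vectors are swapped. Below is a minimal, self-contained
// sketch of the same frontier-expansion pattern over a plain character trie. The Node type,
// the trie contents, and the probability values are hypothetical stand-ins for the LatinIME
// structures; the sketch only illustrates the traversal technique.

#include <algorithm>
#include <cctype>
#include <map>
#include <string>
#include <vector>

namespace {

struct Node {
    int probability = -1;           // -1 plays the role of NOT_A_PROBABILITY.
    std::map<char, Node> children;  // Outgoing edges keyed by lowercased character.
};

// Consumes one input character per level, keeping only nodes reachable from the frontier,
// and returns the best probability among nodes that survive the whole input.
int maxProbabilityOfExactMatchesSketch(const Node &root, const std::string &input) {
    std::vector<const Node *> current{&root};
    std::vector<const Node *> next;
    for (const char c : input) {
        const char lower = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        for (const Node *node : current) {
            const auto it = node->children.find(lower);
            if (it != node->children.end()) {
                next.push_back(&it->second);
            }
        }
        current.clear();
        current.swap(next);
    }
    int maxProbability = -1;
    for (const Node *node : current) {
        maxProbability = std::max(maxProbability, node->probability);
    }
    return maxProbability;
}

}  // namespace

int main() {
    Node root;
    root.children['h'].children['i'].probability = 120;  // "hi" is a terminal word.
    // Case differences are ignored, mirroring the toBaseLowerCase() handling above.
    return maxProbabilityOfExactMatchesSketch(root, "Hi") == 120 ? 0 : 1;
}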
/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
        const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
        const WordAttributes wordAttributes, const bool isLastWord) {
    const bool currentWordExactMatch =
            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
    // When we have to block offensive words, non-exact matched offensive words should not be
    // output.
    const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();

    const bool isBlockedOffensiveWord = shouldBlockOffensiveWords
            && wordAttributes.isPossiblyOffensive();

    // This function is called in two situations:
    //
    // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
    //    of the search, and isLastWord will be true.
    //        "f**k"
    //             |
    //             \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
    //    In this case, if the current word is an exact match, we will always let the word
    //    through, even if the user is blocking offensive words (it's exactly what they typed!)
    //
    // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
    //    to start a new search at root, to try to match the rest of the input. In this case,
    //    terminalDicNode will point to the terminal node we just hit, and isLastWord will be
    //    false.
    //        "fuckvthis"
    //             |
    //             \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
    //
    //    In this case, we should NOT allow the match through (correcting "fuckthis" to
    //    "f**k this" when offensive words are blocked would be a bad idea).
    //
    // In the case of a multi-word correction where the offensive word is typed last (eg.
    // for the input "allfuck"), this function will be called with isLastWord==true, but
    // currentWordExactMatch==false. So we are OK in this case as well.
    //        "allfuck"
    //               |
    //               \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
    if (isLastWord && currentWordExactMatch) {
        return false;
    } else {
        return isBlockedOffensiveWord;
    }
}
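// The branches above reduce to a small truth table over three booleans. The standalone sketch
// below reimplements just that predicate so the cases described in the comment can be
// exercised in isolation; shouldBlockWordSketch is a hypothetical helper, not part of the
// SuggestionsOutputUtils API.

#include <cassert>

// An exact match typed as the last word is never blocked; otherwise a word is blocked exactly
// when offensive words are being filtered and the word is possibly offensive.
bool shouldBlockWordSketch(const bool blockOffensiveWords, const bool isPossiblyOffensive,
        const bool isLastWord, const bool currentWordExactMatch) {
    if (isLastWord && currentWordExactMatch) {
        return false;  // Case 1: the user typed exactly this word; always let it through.
    }
    return blockOffensiveWords && isPossiblyOffensive;
}

int main() {
    // Case 1: "f**k" typed exactly as the last word -> never blocked, even when filtering.
    assert(!shouldBlockWordSketch(true, true, true, true));
    // Case 2: offensive word hit in the middle of the search ("fuckvthis") -> blocked.
    assert(shouldBlockWordSketch(true, true, false, true));
    // Multi-word correction ending in an offensive word ("allfuck") -> blocked, because the
    // last word is not an exact match of the whole input.
    assert(shouldBlockWordSketch(true, true, true, false));
    // Filtering disabled -> nothing is blocked.
    assert(!shouldBlockWordSketch(false, true, false, true));
    return 0;
}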
// Builds a WordProperty for the given word: its unigram attributes plus all related ngram and
// shortcut entries.
const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
        const CodePointArrayView wordCodePoints) const {
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        AKLOGE("getWordProperty is called for invalid word.");
        return WordProperty();
    }
    const LanguageModelDictContent *const languageModelDictContent =
            mBuffers->getLanguageModelDictContent();
    // Fetch ngram information.
    std::vector<NgramProperty> ngrams;
    int ngramTargetCodePoints[MAX_WORD_LENGTH];
    int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
    int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    bool ngramPrevWordIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
            mHeaderPolicy, wordId)) {
        const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
                MAX_WORD_LENGTH, ngramTargetCodePoints);
        const WordIdArrayView prevWordIds = entry.getPrevWordIds();
        for (size_t i = 0; i < prevWordIds.size(); ++i) {
            ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(
                    prevWordIds[i], MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
            ngramPrevWordIsBeginningOfSentence[i] = languageModelDictContent
                    ->getProbabilityEntry(prevWordIds[i]).representsBeginningOfSentence();
            if (ngramPrevWordIsBeginningOfSentence[i]) {
                ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
                        ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
            }
        }
        const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
                ngramPrevWordIsBeginningOfSentence, prevWordIds.size());
        const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
        const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
        // TODO: Output flags in WordAttributes.
        ngrams.emplace_back(ngramContext,
                CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
                entry.getWordAttributes().getProbability(), *historicalInfo);
    }
    // Fetch shortcut information.
    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    int shortcutPos = getShortcutPositionOfWord(wordId);
    if (shortcutPos != NOT_A_DICT_POS) {
        int shortcutTarget[MAX_WORD_LENGTH];
        const ShortcutDictContent *const shortcutDictContent =
                mBuffers->getShortcutDictContent();
        bool hasNext = true;
        while (hasNext) {
            int shortcutTargetLength = 0;
            int shortcutProbability = NOT_A_PROBABILITY;
            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH,
                    shortcutTarget, &shortcutTargetLength, &shortcutProbability, &hasNext,
                    &shortcutPos);
            shortcuts.emplace_back(
                    CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
                    shortcutProbability);
        }
    }
    const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
            WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
    const ProbabilityEntry probabilityEntry =
            languageModelDictContent->getProbabilityEntry(wordId);
    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
    const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
            wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
            wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
            *historicalInfo, std::move(shortcuts));
    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
        const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
        const bool boostExactMatches, const bool forceCommitMultiWords,
        const bool outputSecondWordFirstLetterInputIndex,
        SuggestionResults *const outSuggestionResults) {
    if (DEBUG_GEO_FULL) {
        terminalDicNode->dump("OUT:");
    }
    const float doubleLetterCost =
            scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
    const float compoundDistance =
            terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel)
                    + doubleLetterCost;
    const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
            ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
                    terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
    const bool isExactMatch =
            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
    const bool isExactMatchWithIntentionalOmission =
            ErrorTypeUtils::isExactMatchWithIntentionalOmission(
                    terminalDicNode->getContainedErrorTypes());
    // TODO: Decide whether the word should be auto-corrected or not here.
    const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent(
            terminalDicNode->getContainedErrorTypes());
    const int outputTypeFlags =
            (wordAttributes.isPossiblyOffensive() ?
                    Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
            | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
            | (isExactMatchWithIntentionalOmission ?
                    Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0)
            | (isAppropriateForAutoCorrection ?
                    Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
    // Entries that are blacklisted or do not represent a word should not be output.
    const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
    const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
            terminalDicNode, wordAttributes, true /* isLastWord */);
    // Increase output score of top typing suggestion to ensure autocorrection.
    // TODO: Better integration with java side autocorrection logic.
    const int finalScore = scoringPolicy->calculateFinalScore(
            compoundDistance, traverseSession->getInputSize(),
            terminalDicNode->getContainedErrorTypes(),
            (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
            boostExactMatches, wordAttributes.getProbability() == 0);
    // Don't output invalid or blocked offensive words. However, we still need to submit their
    // shortcuts if any.
    if (isValidWord && !shouldBlockThisWord) {
        int codePoints[MAX_WORD_LENGTH];
        terminalDicNode->outputResult(codePoints);
        const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
                terminalDicNode->getSecondWordFirstInputIndex(
                        traverseSession->getProximityInfoState(0)) :
                NOT_AN_INDEX;
        outSuggestionResults->addSuggestion(codePoints,
                terminalDicNode->getTotalNodeCodePointCount(), finalScore,
                Dictionary::KIND_CORRECTION | outputTypeFlags, indexToPartialCommit,
                computeFirstWordConfidence(terminalDicNode));
    }

    // Output shortcuts.
    // Shortcut is not supported for multiple words suggestions.
    // TODO: Check shortcuts during traversal for multiple words suggestions.
    if (!terminalDicNode->hasMultipleWords()) {
        BinaryDictionaryShortcutIterator shortcutIt =
                traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
                        terminalDicNode->getWordId());
        const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
        outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
    }
}
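// outputTypeFlags above packs several independent word properties into one int by OR-ing bit
// flags, and addSuggestion later ORs that into the base kind. The short sketch below shows the
// same composition in isolation; the constant values are hypothetical, chosen only to be
// distinct bits, and merely stand in for Dictionary::KIND_CORRECTION and the
// Dictionary::KIND_FLAG_* constants.

#include <cstdio>

// Hypothetical stand-ins for the real kind and flag constants.
constexpr int KIND_CORRECTION = 0x01;
constexpr int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x10000;
constexpr int KIND_FLAG_EXACT_MATCH = 0x20000;
constexpr int KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION = 0x40000;

int main() {
    const bool isPossiblyOffensive = false;
    const bool isExactMatch = true;
    const bool isAppropriateForAutoCorrection = true;

    // Each property contributes its bit or 0, and the results are OR-ed together, mirroring
    // how outputSuggestionsOfDicNode builds outputTypeFlags.
    const int outputTypeFlags =
            (isPossiblyOffensive ? KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
            | (isExactMatch ? KIND_FLAG_EXACT_MATCH : 0)
            | (isAppropriateForAutoCorrection ? KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
    const int kind = KIND_CORRECTION | outputTypeFlags;

    // Consumers can test an individual property with a bitwise AND.
    std::printf("exact match: %s\n", (kind & KIND_FLAG_EXACT_MATCH) ? "yes" : "no");
    return 0;
}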