bool ProximityInfo::hasSpaceProximity(const int x, const int y) const { if (x < 0 || y < 0) { if (DEBUG_DICT) { AKLOGI("HasSpaceProximity: Illegal coordinates (%d, %d)", x, y); // TODO: Enable this assertion. //ASSERT(false); } return false; } const int startIndex = ProximityInfoUtils::getStartIndexFromCoordinates(x, y, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH); if (DEBUG_PROXIMITY_INFO) { AKLOGI("hasSpaceProximity: index %d, %d, %d", startIndex, x, y); } int *proximityCharsArray = mProximityCharsArray; for (int i = 0; i < MAX_PROXIMITY_CHARS_SIZE; ++i) { if (DEBUG_PROXIMITY_INFO) { AKLOGI("Index: %d", mProximityCharsArray[startIndex + i]); } if (proximityCharsArray[startIndex + i] == KEYCODE_SPACE) { return true; } } return false; }
// Builds the native proximity information for one keyboard from the values and arrays handed
// over through JNI.
//
// env                     JNI environment used to read the Java string and arrays.
// localeJStr              locale of the keyboard; copied into mLocaleStr when it fits.
// keyboardWidth/Height    keyboard size; together with gridWidth/gridHeight they determine the
//                         cell size (rounded up).
// proximityChars          must contain exactly
//                         gridWidth * gridHeight * MAX_PROXIMITY_CHARS_SIZE entries.
// keyCount                number of keys; clamped to MAX_KEY_COUNT_IN_A_KEYBOARD.
// key*/sweetSpot* arrays  per-key data; touch position correction data is considered present
//                         only when keyCount > 0 and none of these arrays is null.
//
// If proximityChars has an unexpected length the constructor logs an error and returns early;
// in that case the per-key arrays are not filled in by this body and initializeG() is not
// called.
ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, const int keyboardWidth,
        const int keyboardHeight, const int gridWidth, const int gridHeight,
        const int mostCommonKeyWidth, const int mostCommonKeyHeight,
        const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates,
        const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights,
        const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs,
        const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii)
        : GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight),
          MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth),
          MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth),
          MOST_COMMON_KEY_HEIGHT(mostCommonKeyHeight),
          NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE(1.0f + SQUARE_FLOAT(
                  static_cast<float>(mostCommonKeyHeight)
                          / static_cast<float>(mostCommonKeyWidth))),
          CELL_WIDTH((keyboardWidth + gridWidth - 1) / gridWidth),
          CELL_HEIGHT((keyboardHeight + gridHeight - 1) / gridHeight),
          KEY_COUNT(min(keyCount, MAX_KEY_COUNT_IN_A_KEYBOARD)),
          KEYBOARD_WIDTH(keyboardWidth), KEYBOARD_HEIGHT(keyboardHeight),
          KEYBOARD_HYPOTENUSE(hypotf(KEYBOARD_WIDTH, KEYBOARD_HEIGHT)),
          HAS_TOUCH_POSITION_CORRECTION_DATA(keyCount > 0 && keyXCoordinates && keyYCoordinates
                  && keyWidths && keyHeights && keyCharCodes && sweetSpotCenterXs
                  && sweetSpotCenterYs && sweetSpotRadii),
          mProximityCharsArray(new int[GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE
                  /* proximityCharsLength */]),
          mCodeToKeyMap() {
    // Zero the locale buffer up front so that it is an empty, terminated string on every exit
    // path, including the early return below.
    memset(mLocaleStr, 0, sizeof(mLocaleStr));

    /* Let's check the input array length here to make sure */
    const jsize proximityCharsLength = env->GetArrayLength(proximityChars);
    if (proximityCharsLength != GRID_WIDTH * GRID_HEIGHT * MAX_PROXIMITY_CHARS_SIZE) {
        AKLOGE("Invalid proximityCharsLength: %d", proximityCharsLength);
        ASSERT(false);
        return;
    }
    if (DEBUG_PROXIMITY_INFO) {
        AKLOGI("Create proximity info array %d", proximityCharsLength);
    }

    const jsize localeCStrUtf8Length = env->GetStringUTFLength(localeJStr);
    if (localeCStrUtf8Length >= MAX_LOCALE_STRING_LENGTH) {
        AKLOGI("Locale string length too long: length=%d", localeCStrUtf8Length);
        ASSERT(false);
        // ASSERT may compile to nothing; never copy a locale that failed the length check, so
        // mLocaleStr simply stays empty instead of being overrun.
    } else {
        env->GetStringUTFRegion(localeJStr, 0, env->GetStringLength(localeJStr), mLocaleStr);
    }

    safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityCharsLength,
            mProximityCharsArray);
    safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates);
    safeGetOrFillZeroIntArrayRegion(env, keyYCoordinates, KEY_COUNT, mKeyYCoordinates);
    safeGetOrFillZeroIntArrayRegion(env, keyWidths, KEY_COUNT, mKeyWidths);
    safeGetOrFillZeroIntArrayRegion(env, keyHeights, KEY_COUNT, mKeyHeights);
    safeGetOrFillZeroIntArrayRegion(env, keyCharCodes, KEY_COUNT, mKeyCodePoints);
    safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterXs, KEY_COUNT, mSweetSpotCenterXs);
    safeGetOrFillZeroFloatArrayRegion(env, sweetSpotCenterYs, KEY_COUNT, mSweetSpotCenterYs);
    safeGetOrFillZeroFloatArrayRegion(env, sweetSpotRadii, KEY_COUNT, mSweetSpotRadii);
    initializeG();
}
// Stores the dictionary buffer, the maximum word length and the owning dictionary.
// Nothing is read from the buffer here; the only other effect is a debug log line.
BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
        Dictionary *parentDictionary)
        : DICT(dict),
          MAX_WORD_LENGTH(maxWordLength),
          mParentDictionary(parentDictionary) {
    if (!DEBUG_DICT) {
        return;
    }
    AKLOGI("BigramDictionary - constructor");
}
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1) { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); return false; } if (mBufferWithExtendableBuffer.getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update."); return false; } const int word0Pos = getTerminalNodePositionOfWord(word0, length0, false /* forceLowerCaseSearch */); if (word0Pos == NOT_A_DICT_POS) { return false; } const int word1Pos = getTerminalNodePositionOfWord(word1, length1, false /* forceLowerCaseSearch */); if (word1Pos == NOT_A_DICT_POS) { return false; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { mBigramCount--; return true; } else { return false; } }
bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, const int probability) { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); return false; } if (mBufferWithExtendableBuffer.getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update."); return false; } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithPtNodeArrayPos(getRootPosition()); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewUnigram = false; if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, &addedNewUnigram)) { if (addedNewUnigram) { mUnigramCount++; } return true; } else { return false; } }
void UnigramDictionary::getSuggestionCandidates(const bool useFullEditDistance, const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, WordsPriorityQueuePool *queuePool, const bool doAutoCompletion, const int maxErrors, const int currentWordIndex) { uint8_t totalTraverseCount = correction->pushAndGetTotalTraverseCount(); if (DEBUG_DICT) { AKLOGI("Traverse count %d", totalTraverseCount); } if (totalTraverseCount > MULTIPLE_WORDS_SUGGESTION_MAX_TOTAL_TRAVERSE_COUNT) { if (DEBUG_DICT) { AKLOGI("Abort traversing %d", totalTraverseCount); } return; } // TODO: Remove setCorrectionParams correction->setCorrectionParams(0, 0, 0, -1 /* spaceProximityPos */, -1 /* missingSpacePos */, useFullEditDistance, doAutoCompletion, maxErrors); int rootPosition = ROOT_POS; // Get the number of children of root, then increment the position int childCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &rootPosition); int outputIndex = 0; correction->initCorrectionState(rootPosition, childCount, (inputLength <= 0)); // Depth first search while (outputIndex >= 0) { if (correction->initProcessState(outputIndex)) { int siblingPos = correction->getTreeSiblingPos(outputIndex); int firstChildPos; const bool needsToTraverseChildrenNodes = processCurrentNode(siblingPos, bigramMap, bigramFilter, correction, &childCount, &firstChildPos, &siblingPos, queuePool, currentWordIndex); // Update next sibling pos correction->setTreeSiblingPos(outputIndex, siblingPos); if (needsToTraverseChildrenNodes) { // Goes to child node outputIndex = correction->goDownTree(outputIndex, childCount, firstChildPos); } } else { // Goes to parent sibling node outputIndex = correction->getTreeParentIndex(outputIndex); } } }
bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, const bool isValidWord, const HistoricalInfo historicalInfo) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " "dictionary."); return false; } const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? false : isValidWord; int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { // The word is not in the dictionary. const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); return false; } if (!isValidWord) { return true; } wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); return false; } // Refresh word ids. 
ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } // Update entries for beginning of sentence. if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } } if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } return true; }
// Writes the dictionary to filePath together with the current unigram and bigram counts.
// Does nothing except log a warning when the dictionary is not updatable.
void DynamicPatriciaTriePolicy::flush(const char *const filePath) {
    const bool isUpdatable = mBuffer->isUpdatable();
    if (!isUpdatable) {
        AKLOGI("Warning: flush() is called for non-updatable dictionary.");
        return;
    }
    // The helper is created with decaying turned off for a plain flush.
    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
            &mBigramListPolicy, &mShortcutListPolicy, false /* needsToDecay */);
    writingHelper.writeToDictFile(filePath, &mHeaderPolicy, mUnigramCount, mBigramCount);
}
// Hands the typed input to proximityInfo and initializes `correction` for it.
// The maximum search depth is inputLength * MAX_DEPTH_MULTIPLIER, capped at MAX_WORD_LENGTH.
void UnigramDictionary::initSuggestions(ProximityInfo *proximityInfo, const int *xCoordinates,
        const int *yCoordinates, const int *codes, const int inputLength,
        Correction *correction) {
    if (DEBUG_DICT) {
        AKLOGI("initSuggest");
        DUMP_WORD_INT(codes, inputLength);
    }
    proximityInfo->setInputParams(codes, inputLength, xCoordinates, yCoordinates);

    const int uncappedDepth = inputLength * MAX_DEPTH_MULTIPLIER;
    const int maxDepth = min(uncappedDepth, MAX_WORD_LENGTH);
    correction->initCorrection(proximityInfo, inputLength, maxDepth);
}
bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequency) { word[length] = 0; if (DEBUG_DICT) { #ifdef FLAG_DBG char s[length + 1]; for (int i = 0; i <= length; i++) s[i] = word[i]; AKLOGI("Bigram: Found word = %s, freq = %d :", s, frequency); #endif } // Find the right insertion point int insertAt = 0; while (insertAt < mMaxBigrams) { if (frequency > mBigramFreq[insertAt] || (mBigramFreq[insertAt] == frequency && length < Dictionary::wideStrLen(mBigramChars + insertAt * MAX_WORD_LENGTH))) { break; } insertAt++; } if (DEBUG_DICT) { AKLOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams); } if (insertAt < mMaxBigrams) { memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]), (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]), (mMaxBigrams - insertAt - 1) * sizeof(mBigramFreq[0])); mBigramFreq[insertAt] = frequency; memmove((char*) mBigramChars + (insertAt + 1) * MAX_WORD_LENGTH * sizeof(short), (char*) mBigramChars + (insertAt ) * MAX_WORD_LENGTH * sizeof(short), (mMaxBigrams - insertAt - 1) * sizeof(short) * MAX_WORD_LENGTH); unsigned short *dest = mBigramChars + (insertAt ) * MAX_WORD_LENGTH; while (length--) { *dest++ = *word++; } *dest = 0; // NULL terminate if (DEBUG_DICT) { AKLOGI("Bigram: Added word at %d", insertAt); } return true; } return false; }
// Writes the dictionary to filePath together with the current entry counts.
// Returns false when the dictionary is not updatable or when writing fails; a write failure
// also marks the dictionary as corrupted.
bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
        return false;
    }
    const bool written = mWritingHelper.writeToDictFile(filePath,
            mEntryCounters.getEntryCounts());
    if (written) {
        return true;
    }
    AKLOGE("Cannot flush the dictionary to file.");
    mIsCorrupted = true;
    return false;
}
// Writes the dictionary to filePath through the GC-enabled writer, starting at the root
// position.
// Returns false when the dictionary is not updatable or when writing fails; a write failure
// also marks the dictionary as corrupted.
bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
        return false;
    }
    const bool written = mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath);
    if (written) {
        return true;
    }
    AKLOGE("Cannot flush the dictionary to file with GC.");
    mIsCorrupted = true;
    return false;
}
// Writes `codePointCount` code points through the buffer at *codePointFieldPos; the buffer is
// told whether there is more than one code point.
// Returns false without writing when codePointCount is not positive; otherwise returns the
// buffer's result.
/* static */ bool DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(
        BufferWithExtendableBuffer *const buffer, const int *const codePoints,
        const int codePointCount, int *const codePointFieldPos) {
    if (codePointCount > 0) {
        return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount,
                codePointCount > 1 /* hasMultipleCodePoints */, codePointFieldPos);
    }
    AKLOGI("code points cannot be written because codePointCount is invalid: %d",
            codePointCount);
    ASSERT(false);
    return false;
}
void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } const bool needsToDecay = mHeaderPolicy.isDecayingDict() && (mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( false /* mindsBlockByDecay */, mUnigramCount, mBigramCount, &mHeaderPolicy)); DynamicBigramListPolicy bigramListPolicyForGC(&mHeaderPolicy, &mBufferWithExtendableBuffer, &mShortcutListPolicy, needsToDecay); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &bigramListPolicyForGC, &mShortcutListPolicy, needsToDecay); writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); mNeedsToDecayForTesting = false; }
// Read node array size and process empty node arrays. Nodes and arrays are counted up in this // method to avoid an infinite loop. void DynamicPatriciaTrieReadingHelper::nextPtNodeArray() { if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) { // Reading invalid position because of a bug or a broken dictionary. AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", mReadingState.mPos, mBuffer->getTailPosition()); ASSERT(false); mIsError = true; mReadingState.mPos = NOT_A_DICT_POS; return; } mReadingState.mPosOfLastPtNodeArrayHead = mReadingState.mPos; const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { mReadingState.mPos -= mBuffer->getOriginalBufferSize(); } mReadingState.mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( dictBuf, &mReadingState.mPos); if (usesAdditionalBuffer) { mReadingState.mPos += mBuffer->getOriginalBufferSize(); } // Count up nodes and node arrays to avoid infinite loop. mReadingState.mTotalNodeCount += mReadingState.mNodeCount; mReadingState.mNodeArrayCount++; if (mReadingState.mNodeCount < 0 || mReadingState.mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP || mReadingState.mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", mReadingState.mNodeCount, mReadingState.mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mReadingState.mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); mIsError = true; mReadingState.mPos = NOT_A_DICT_POS; return; } if (mReadingState.mNodeCount == 0) { // Empty node array. Try following forward link. followForwardLink(); } }
// Writes the size of a PtNode array at *arraySizeFieldPos through the buffer.
// The value written is arraySize combined with LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG, using
// LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE as the field size.
// Returns false without writing when arraySize exceeds MAX_PTNODE_ARRAY_SIZE; otherwise returns
// the buffer's result.
/* static */ bool DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(
        BufferWithExtendableBuffer *const buffer, const size_t arraySize,
        int *const arraySizeFieldPos) {
    // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to
    // simplify updating process.
    // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays.
    /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) {
        return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE,
                arraySizeFieldPos);
    } else */
    if (arraySize > MAX_PTNODE_ARRAY_SIZE) {
        // arraySize is a size_t, so it has to be printed with %zu.
        AKLOGI("PtNode array size cannot be written because arraySize is too large: %zu",
                arraySize);
        ASSERT(false);
        return false;
    }
    const uint32_t data =
            static_cast<uint32_t>(arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG);
    return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE,
            arraySizeFieldPos);
}
bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); return false; } if (mBuffers->isNearSizeLimit()) { // Additional buffer size is near the limit. return true; } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { // Total extended region size of the trie exceeds the limit. return true; } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS && mDictBuffer->getUsedAdditionalBufferSize() > 0) { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy->isDecayingDict()) { return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), mHeaderPolicy); } return false; }
// Writes the offset from basePos to targetPos at *offsetFieldPos through the buffer.
//
// Encoding, as implemented below:
//  - targetPos == NOT_A_DICT_POS is written as DynamicPtReadingUtils::DICT_OFFSET_INVALID,
//  - a zero offset is written as DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET,
//  - a negative offset is written as its magnitude combined with DICT_OFFSET_NEGATIVE_FLAG.
//
// Returns false without writing when the value to encode lies outside
// [MIN_DICT_OFFSET_VALUE, MAX_DICT_OFFSET_VALUE]; otherwise returns the buffer's result.
/* static */ bool DynamicPtWritingUtils::writeDictOffset(BufferWithExtendableBuffer *const buffer,
        const int targetPos, const int basePos, int *const offsetFieldPos) {
    int offset;
    if (targetPos == NOT_A_DICT_POS) {
        // Do not do arithmetic on the sentinel: subtracting basePos from it is meaningless and,
        // depending on the sentinel's value, could overflow a signed int.
        offset = DynamicPtReadingUtils::DICT_OFFSET_INVALID;
    } else {
        offset = targetPos - basePos;
        if (offset == 0) {
            offset = DynamicPtReadingUtils::DICT_OFFSET_ZERO_OFFSET;
        }
    }
    if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) {
        AKLOGI("offset cannot be written because the offset is too large or too small: %d",
                offset);
        ASSERT(false);
        return false;
    }
    // The sign is carried by a flag bit; the remaining bits hold the magnitude.
    const uint32_t data = (offset >= 0) ? offset : (abs(offset) | DICT_OFFSET_NEGATIVE_FLAG);
    return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos);
}
void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate) { if (inputLength >= MAX_WORD_LENGTH) return; if (DEBUG_DICT) { AKLOGI("--- Suggest multiple words"); } // Allocating fixed length array on stack unsigned short outputWord[MAX_WORD_LENGTH]; int freqArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; int wordLengthArray[MULTIPLE_WORDS_SUGGESTION_MAX_WORDS]; const int outputWordLength = 0; const int startInputPos = 0; const int startWordIndex = 0; getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, startInputPos, startWordIndex, outputWordLength, freqArray, wordLengthArray, outputWord); }
bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } if (!ngramContext->isValid()) { AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); return false; } if (wordCodePoints.size() > MAX_WORD_LENGTH) { AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", wordCodePoints.size()); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSerch */); if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { return false; } const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { mEntryCounters.decrementNgramCount( NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); return true; } else { return false; } }
bool DynamicPatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); return false; } if (mBufferWithExtendableBuffer.isNearSizeLimit()) { // Additional buffer size is near the limit. return true; } else if (mHeaderPolicy.getExtendedRegionSize() + mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > MAX_DICT_EXTENDED_REGION_SIZE) { // Total extended region size exceeds the limit. return true; } else if (mBufferWithExtendableBuffer.getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS && mBufferWithExtendableBuffer.getUsedAdditionalBufferSize() > 0) { // Needs to reduce dictionary size. return true; } else if (mHeaderPolicy.isDecayingDict()) { return mNeedsToDecayForTesting || ForgettingCurveUtils::needsToDecay( mindsBlockByGC, mUnigramCount, mBigramCount, &mHeaderPolicy); } return false; }
bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); return false; } const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } const int ptNodePos = mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); return false; } if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { return false; } if (!ptNodeParams.representsNonWordInfo()) { mEntryCounters.decrementNgramCount(NgramType::Unigram); } return true; }
// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. // If the return value is false, then the caller should read in the output "nextSiblingPosition" // to find out the address of the next sibling node and pass it to a new call of processCurrentNode. // It is worthy to note that when false is returned, the output values other than // nextSiblingPosition are undefined. // If the return value is true, then the caller must proceed to traverse the children of this // node. processCurrentNode will output the information about the children: their count in // newCount, their position in newChildrenPosition, the traverseAllNodes flag in // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when // there aren't any more nodes at this level, it merely returns the address of the first byte after // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } int pos = initialPos; // Flags contain the following information: // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address // is on the specified number of bytes. // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. 
// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); bool needsToInvokeOnTerminal = false; // This gets only ONE character from the stream. Next there will be: // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node // else if FLAG_IS_TERMINAL: the frequency // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); assert(NOT_A_CHARACTER != c); // We are going to loop through each character and make it look like it's a different // node each time. To do that, we will process characters in this node in order until // we find the character terminator. This is signalled by getCharCode* returning // NOT_A_CHARACTER. // As a special case, if there is only one character in this node, we must not read the // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. // This way, each loop run will look like a "virtual node". do { // We prefetch the next char. If 'c' is the last char of this node, we will have // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. const int32_t nextc = hasMultipleChars ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; const bool isLastChar = (NOT_A_CHARACTER == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. 
const bool isTerminal = isLastChar && isTerminalNode; Correction::CorrectionType stateType = correction->processCharAndCalcState( c, isTerminal); if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL || stateType == Correction::ON_TERMINAL) { needsToInvokeOnTerminal = true; } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining // characters in this node, the frequency if it's there, read the next sibling // position to output it, then return false. // We don't have to output other values because we return false, as in // "don't traverse children". if (!isLastChar) { pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); } pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; } // Prepare for the next character. Promote the prefetched char to current char - the loop // will take care of prefetching the next. If we finally found our last char, nextc will // contain NOT_A_CHARACTER. c = nextc; } while (NOT_A_CHARACTER != c); if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. 
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, unigramFreq); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); // This character matched the typed character (enough to traverse the node at least) // so we just evaluated it. Now we should evaluate this virtual node's children - that // is, if it has any. If it has no children, we're done here - so we skip the end of // the node, output the siblings position, and return false "don't traverse children". // Note that !hasChildren implies isLastChar, so we know we don't have to skip any // remaining char in this group for there can't be any. if (!hasChildren) { pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; } // Optimization: Prune out words that are too long compared to how much was typed. if (correction->needsToPrune()) { pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { AKLOGI("Traversing was pruned."); } return false; } } // Now we finished processing this node, and we want to traverse children. If there are no // children, we can't come here. assert(BinaryFormat::hasChildrenInFlags(flags)); // If this node was a terminal it still has the frequency under the pointer (it may have been // read, but not skipped - see readFrequencyWithoutMovingPointer). // Next come the children position, then possibly attributes (attributes are bigrams only for // now, maybe something related to shortcuts in the future). 
// Once this is read, we still need to output the number of nodes in the immediate children of // this node, so we read and output it before returning true, as in "please traverse children". pos = BinaryFormat::skipFrequency(flags, pos); int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); *newChildrenPosition = childrenPos; return true; }
void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int inputLength, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool *queuePool) { PROF_OPEN; PROF_START(0); PROF_END(0); PROF_START(1); getOneWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, bigramMap, bigramFilter, useFullEditDistance, inputLength, correction, queuePool); PROF_END(1); PROF_START(2); // Note: This line is intentionally left blank PROF_END(2); PROF_START(3); // Note: This line is intentionally left blank PROF_END(3); PROF_START(4); bool hasAutoCorrectionCandidate = false; WordsPriorityQueue* masterQueue = queuePool->getMasterQueue(); if (masterQueue->size() > 0) { float nsForMaster = masterQueue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputLength, 0, 0, 0); hasAutoCorrectionCandidate = (nsForMaster > START_TWO_WORDS_CORRECTION_THRESHOLD); } PROF_END(4); PROF_START(5); // Multiple word suggestions if (SUGGEST_MULTIPLE_WORDS && inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) { getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate); } PROF_END(5); PROF_START(6); // Note: This line is intentionally left blank PROF_END(6); if (DEBUG_DICT) { queuePool->dumpSubQueue1TopSuggestions(); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { WordsPriorityQueue* queue = queuePool->getSubQueue(FIRST_WORD_INDEX, i); if (queue->size() > 0) { WordsPriorityQueue::SuggestedWord* sw = queue->top(); const int score = sw->mScore; const unsigned short* word = sw->mWord; const int wordLength = sw->mWordLength; float ns = Correction::RankingAlgorithm::calcNormalizedScore( proximityInfo->getPrimaryInputWord(), i, word, wordLength, score); ns += 0; AKLOGI("--- TOP SUB WORDS 
for %d --- %d %f [%d]", i, score, ns, (ns > TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD)); DUMP_WORD(proximityInfo->getPrimaryInputWord(), i); DUMP_WORD(word, wordLength); } } } }
int UnigramDictionary::getSubStringSuggestion( ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, Correction *correction, WordsPriorityQueuePool* queuePool, const int inputLength, const bool hasAutoCorrectionCandidate, const int currentWordIndex, const int inputWordStartPos, const int inputWordLength, const int outputWordStartPos, const bool isSpaceProximity, int *freqArray, int*wordLengthArray, unsigned short* outputWord, int *outputWordLength) { if (inputWordLength > MULTIPLE_WORDS_SUGGESTION_MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_ABORT; } ///////////////////////////////////////////// // safety net for multiple word suggestion // // TODO: Remove this safety net // ///////////////////////////////////////////// int smallWordCount = 0; int singleLetterWordCount = 0; if (inputWordLength == 1) { ++singleLetterWordCount; } if (inputWordLength <= 2) { // small word == single letter or 2-letter word ++smallWordCount; } for (int i = 0; i < currentWordIndex; ++i) { const int length = wordLengthArray[i]; if (length == 1) { ++singleLetterWordCount; // Safety net to avoid suggesting sequential single letter words if (i < (currentWordIndex - 1)) { if (wordLengthArray[i + 1] == 1) { return FLAG_MULTIPLE_SUGGEST_ABORT; } } else if (inputWordLength == 1) { return FLAG_MULTIPLE_SUGGEST_ABORT; } } if (length <= 2) { ++smallWordCount; } // Safety net to avoid suggesting multiple words with many (4 or more, for now) small words if (singleLetterWordCount >= 3 || smallWordCount >= 4) { return FLAG_MULTIPLE_SUGGEST_ABORT; } } ////////////////////////////////////////////// // TODO: Remove the safety net above // ////////////////////////////////////////////// unsigned short* tempOutputWord = 0; int nextWordLength = 0; // TODO: Optimize init suggestion initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction); int freq = getMostFrequentWordLike( inputWordStartPos, 
inputWordLength, proximityInfo, mWord); if (freq > 0) { nextWordLength = inputWordLength; tempOutputWord = mWord; } else if (!hasAutoCorrectionCandidate) { if (inputWordStartPos > 0) { const int offset = inputWordStartPos; initSuggestions(proximityInfo, &xcoordinates[offset], &ycoordinates[offset], codes + offset, inputWordLength, correction); queuePool->clearSubQueue(currentWordIndex); // TODO: pass the bigram list for substring suggestion getSuggestionCandidates(useFullEditDistance, inputWordLength, 0 /* bigramMap */, 0 /* bigramFilter */, correction, queuePool, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS, currentWordIndex); if (DEBUG_DICT) { if (currentWordIndex < MULTIPLE_WORDS_SUGGESTION_MAX_WORDS) { AKLOGI("Dump word candidates(%d) %d", currentWordIndex, inputWordLength); for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) { queuePool->getSubQueue(currentWordIndex, i)->dumpTopWord(); } } } } WordsPriorityQueue* queue = queuePool->getSubQueue(currentWordIndex, inputWordLength); // TODO: Return the correct value depending on doAutoCompletion if (!queue || queue->size() <= 0) { return FLAG_MULTIPLE_SUGGEST_ABORT; } int score = 0; const float ns = queue->getHighestNormalizedScore( proximityInfo->getPrimaryInputWord(), inputWordLength, &tempOutputWord, &score, &nextWordLength); if (DEBUG_DICT) { AKLOGI("NS(%d) = %f, Score = %d", currentWordIndex, ns, score); } // Two words correction won't be done if the score of the first word doesn't exceed the // threshold. 
if (ns < TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD || nextWordLength < SUB_QUEUE_MIN_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_SKIP; } freq = score >> (nextWordLength + TWO_WORDS_PLUS_OTHER_ERROR_CORRECTION_DEMOTION_DIVIDER); } if (DEBUG_DICT) { AKLOGI("Freq(%d): %d, length: %d, input length: %d, input start: %d (%d)" , currentWordIndex, freq, nextWordLength, inputWordLength, inputWordStartPos, wordLengthArray[0]); } if (freq <= 0 || nextWordLength <= 0 || MAX_WORD_LENGTH <= (outputWordStartPos + nextWordLength)) { return FLAG_MULTIPLE_SUGGEST_SKIP; } for (int i = 0; i < nextWordLength; ++i) { outputWord[outputWordStartPos + i] = tempOutputWord[i]; } // Put output values freqArray[currentWordIndex] = freq; // TODO: put output length instead of input length wordLengthArray[currentWordIndex] = inputWordLength; const int tempOutputWordLength = outputWordStartPos + nextWordLength; if (outputWordLength) { *outputWordLength = tempOutputWordLength; } if ((inputWordStartPos + inputWordLength) < inputLength) { if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) { return FLAG_MULTIPLE_SUGGEST_SKIP; } outputWord[tempOutputWordLength] = SPACE; if (outputWordLength) { ++*outputWordLength; } } else if (currentWordIndex >= 1) { // TODO: Handle 3 or more words const int pairFreq = correction->getFreqForSplitMultipleWords( freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord); if (DEBUG_DICT) { DUMP_WORD(outputWord, tempOutputWordLength); for (int i = 0; i < currentWordIndex + 1; ++i) { AKLOGI("Split %d,%d words: freq = %d, length = %d", i, currentWordIndex + 1, freqArray[i], wordLengthArray[i]); } AKLOGI("Split two words: freq = %d, length = %d, %d, isSpace ? %d", pairFreq, inputLength, tempOutputWordLength, isSpaceProximity); } addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue()); } return FLAG_MULTIPLE_SUGGEST_CONTINUE; }
void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool, const bool hasAutoCorrectionCandidate, const int startInputPos, const int startWordIndex, const int outputWordLength, int *freqArray, int* wordLengthArray, unsigned short* outputWord) { if (startWordIndex >= (MULTIPLE_WORDS_SUGGESTION_MAX_WORDS - 1)) { // Return if the last word index return; } if (startWordIndex >= 1 && (hasAutoCorrectionCandidate || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) { // Do not suggest 3+ words if already has auto correction candidate return; } for (int i = startInputPos + 1; i < inputLength; ++i) { if (DEBUG_CORRECTION_FREQ) { AKLOGI("Multi words(%d), start in %d sep %d start out %d", startWordIndex, startInputPos, i, outputWordLength); DUMP_WORD(outputWord, outputWordLength); } int tempOutputWordLength = 0; // Current word int inputWordStartPos = startInputPos; int inputWordLength = i - startInputPos; const int suggestionFlag = getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, startWordIndex, inputWordStartPos, inputWordLength, outputWordLength, true /* not used */, freqArray, wordLengthArray, outputWord, &tempOutputWordLength); if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_ABORT) { // TODO: break here continue; } else if (suggestionFlag == FLAG_MULTIPLE_SUGGEST_SKIP) { continue; } if (DEBUG_CORRECTION_FREQ) { AKLOGI("Do missing space correction"); } // Next word // Missing space inputWordStartPos = i; inputWordLength = inputLength - i; if(getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, startWordIndex + 1, inputWordStartPos, inputWordLength, 
tempOutputWordLength, false /* missing space */, freqArray, wordLengthArray, outputWord, 0) != FLAG_MULTIPLE_SUGGEST_CONTINUE) { getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, inputLength, correction, queuePool, hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1, tempOutputWordLength, freqArray, wordLengthArray, outputWord); } // Mistyped space ++inputWordStartPos; --inputWordLength; if (inputWordLength <= 0) { continue; } const int x = xcoordinates[inputWordStartPos - 1]; const int y = ycoordinates[inputWordStartPos - 1]; if (!proximityInfo->hasSpaceProximity(x, y)) { continue; } if (DEBUG_CORRECTION_FREQ) { AKLOGI("Do mistyped space correction"); } getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes, useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate, startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength, true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0); } }
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } const NgramContext *const ngramContext = ngramProperty->getNgramContext(); if (!ngramContext->isValid()) { AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); return false; } if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert the ngram to the dictionary. " "length: %zd", ngramProperty->getTargetCodePoints()->size()); return false; } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); if (prevWordIds.empty()) { return false; } for (size_t i = 0; i < prevWordIds.size(); ++i) { if (prevWordIds[i] != NOT_A_WORD_ID) { continue; } if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { return false; } const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, MAX_PROBABILITY /* probability */, HistoricalInfo()); if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); return false; } // Refresh word ids. 
ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } bool addedNewEntry = false; if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { if (addedNewEntry) { mEntryCounters.incrementNgramCount( NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); } return true; } else { return false; } }
bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } if (wordCodePoints.size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert to the dictionary, length: %zd", wordCodePoints.size()); return false; } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", shortcut.getTargetCodePoints()->size()); return false; } } DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; int codePointsToAdd[MAX_WORD_LENGTH]; int codePointCountToAdd = wordCodePoints.size(); memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); if (unigramProperty->representsBeginningOfSentence()) { codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, codePointCountToAdd, MAX_WORD_LENGTH); } if (codePointCountToAdd <= 0) { return false; } const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mEntryCounters.incrementNgramCount(NgramType::Unigram); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. 
const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { AKLOGE("Cannot find word id to add shortcut target."); return false; } const int wordPos = mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); for (const auto &shortcut : unigramProperty->getShortcuts()) { if (!mUpdatingHelper.addShortcutTarget(wordPos, CodePointArrayView(*shortcut.getTargetCodePoints()), shortcut.getProbability())) { AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), shortcut.getProbability()); return false; } } } return true; } else { return false; } }
// TODO: Remove the dependency of "isGeometric" void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength, const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize, const int *const xCoordinates, const int *const yCoordinates, const int *const times, const int *const pointerIds, const bool isGeometric) { ASSERT(isGeometric || (inputSize < MAX_WORD_LENGTH)); mIsContinuousSuggestionPossible = (mHasBeenUpdatedByGeometricInput != isGeometric) ? false : ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible( inputSize, xCoordinates, yCoordinates, times, mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledInputIndice); if (DEBUG_DICT) { AKLOGI("isContinuousSuggestionPossible = %s", (mIsContinuousSuggestionPossible ? "true" : "false")); } mProximityInfo = proximityInfo; mHasTouchPositionCorrectionData = proximityInfo->hasTouchPositionCorrectionData(); mMostCommonKeyWidthSquare = proximityInfo->getMostCommonKeyWidthSquare(); mKeyCount = proximityInfo->getKeyCount(); mCellHeight = proximityInfo->getCellHeight(); mCellWidth = proximityInfo->getCellWidth(); mGridHeight = proximityInfo->getGridWidth(); mGridWidth = proximityInfo->getGridHeight(); memset(mInputProximities, 0, sizeof(mInputProximities)); if (!isGeometric && pointerId == 0) { mProximityInfo->initializeProximities(inputCodes, xCoordinates, yCoordinates, inputSize, mInputProximities); } /////////////////////// // Setup touch points int pushTouchPointStartIndex = 0; int lastSavedInputSize = 0; mMaxPointToKeyLength = maxPointToKeyLength; mSampledInputSize = 0; mMostProbableStringProbability = 0.0f; if (mIsContinuousSuggestionPossible && mSampledInputIndice.size() > 1) { // Just update difference. // Previous two points are never skipped. Thus, we pop 2 input point data here. 
pushTouchPointStartIndex = ProximityInfoStateUtils::trimLastTwoTouchPoints( &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledLengthCache, &mSampledInputIndice); lastSavedInputSize = mSampledInputXs.size(); } else { // Clear all data. mSampledInputXs.clear(); mSampledInputYs.clear(); mSampledTimes.clear(); mSampledInputIndice.clear(); mSampledLengthCache.clear(); mSampledNormalizedSquaredLengthCache.clear(); mSampledNearKeySets.clear(); mSampledSearchKeySets.clear(); mSpeedRates.clear(); mBeelineSpeedPercentiles.clear(); mCharProbabilities.clear(); mDirections.clear(); } if (DEBUG_GEO_FULL) { AKLOGI("Init ProximityInfoState: reused points = %d, last input size = %d", pushTouchPointStartIndex, lastSavedInputSize); } if (xCoordinates && yCoordinates) { mSampledInputSize = ProximityInfoStateUtils::updateTouchPoints(mProximityInfo, mMaxPointToKeyLength, mInputProximities, xCoordinates, yCoordinates, times, pointerIds, inputSize, isGeometric, pointerId, pushTouchPointStartIndex, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledLengthCache, &mSampledInputIndice); } if (mSampledInputSize > 0 && isGeometric) { mAverageSpeed = ProximityInfoStateUtils::refreshSpeedRates(inputSize, xCoordinates, yCoordinates, times, lastSavedInputSize, mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSampledLengthCache, &mSampledInputIndice, &mSpeedRates, &mDirections); ProximityInfoStateUtils::refreshBeelineSpeedRates(mProximityInfo->getMostCommonKeyWidth(), mAverageSpeed, inputSize, xCoordinates, yCoordinates, times, mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledInputIndice, &mBeelineSpeedPercentiles); } if (mSampledInputSize > 0) { ProximityInfoStateUtils::initGeometricDistanceInfos(mProximityInfo, mSampledInputSize, lastSavedInputSize, isGeometric, &mSampledInputXs, &mSampledInputYs, &mSampledNearKeySets, &mSampledNormalizedSquaredLengthCache); if (isGeometric) { // updates probabilities of skipping or mapping each key for 
all points. ProximityInfoStateUtils::updateAlignPointProbabilities( mMaxPointToKeyLength, mProximityInfo->getMostCommonKeyWidth(), mProximityInfo->getKeyCount(), lastSavedInputSize, mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSpeedRates, &mSampledLengthCache, &mSampledNormalizedSquaredLengthCache, &mSampledNearKeySets, &mCharProbabilities); ProximityInfoStateUtils::updateSampledSearchKeySets(mProximityInfo, mSampledInputSize, lastSavedInputSize, &mSampledLengthCache, &mSampledNearKeySets, &mSampledSearchKeySets, &mSampledSearchKeyVectors); mMostProbableStringProbability = ProximityInfoStateUtils::getMostProbableString( mProximityInfo, mSampledInputSize, &mCharProbabilities, mMostProbableString); } } if (DEBUG_SAMPLING_POINTS) { ProximityInfoStateUtils::dump(isGeometric, inputSize, xCoordinates, yCoordinates, mSampledInputSize, &mSampledInputXs, &mSampledInputYs, &mSampledTimes, &mSpeedRates, &mBeelineSpeedPercentiles); } // end /////////////////////// mTouchPositionCorrectionEnabled = mSampledInputSize > 0 && mHasTouchPositionCorrectionData && xCoordinates && yCoordinates; if (!isGeometric && pointerId == 0) { ProximityInfoStateUtils::initPrimaryInputWord( inputSize, mInputProximities, mPrimaryInputWord); } if (DEBUG_GEO_FULL) { AKLOGI("ProximityState init finished: %d points out of %d", mSampledInputSize, inputSize); } mHasBeenUpdatedByGeometricInput = isGeometric; }
/**
 * Expands the dicNodes in the current search priority queue by advancing to the possible child
 * nodes based on the next touch point(s) (or no touch points for lookahead).
 *
 * Every active node is popped and handled in exactly one of three ways: as the continuation of
 * a digraph, as a deferred look-ahead correction (transposition and insertion), or as a regular
 * expansion into its children with the applicable match/correction operations. Processing stops
 * as soon as a popped node reports that its total input size exceeds the limit.
 */
void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const {
    const int inputSize = traverseSession->getInputSize();
    DicNodeVector childDicNodes(TRAVERSAL->getDefaultExpandDicNodeSize());
    DicNode correctionDicNode;

    // TODO: Find more efficient caching
    const bool shouldDepthLevelCache = TRAVERSAL->shouldDepthLevelCache(traverseSession);
    if (shouldDepthLevelCache) {
        traverseSession->getDicTraverseCache()->updateLastCachedInputIndex();
    }
    if (DEBUG_CACHE) {
        AKLOGI("expandCurrentDicNodes depth level cache = %d, inputSize = %d",
                shouldDepthLevelCache, inputSize);
    }

    while (traverseSession->getDicTraverseCache()->activeSize() > 0) {
        DicNode parentNode;
        traverseSession->getDicTraverseCache()->popActive(&parentNode);
        if (parentNode.isTotalInputSizeExceedingLimit()) {
            return;
        }
        childDicNodes.clear();

        const int firstPointerInputIndex = parentNode.getInputIndex(0);
        const bool canDoLookAheadCorrection =
                TRAVERSAL->canDoLookAheadCorrection(traverseSession, &parentNode);
        const bool isLookAheadCorrection = canDoLookAheadCorrection
                && traverseSession->getDicTraverseCache()->isLookAheadCorrectionInputIndex(
                        firstPointerInputIndex);
        const bool isCompletion = parentNode.isCompletion(inputSize);

        const bool shouldNodeLevelCache =
                TRAVERSAL->shouldNodeLevelCache(traverseSession, &parentNode);
        if (shouldDepthLevelCache || shouldNodeLevelCache) {
            if (DEBUG_CACHE) {
                parentNode.dump("PUSH_CACHE");
            }
            traverseSession->getDicTraverseCache()->copyPushContinue(&parentNode);
            parentNode.setCached();
        }

        if (parentNode.isInDigraph()) {
            // Finish digraph handling if the node is in the middle of a digraph expansion.
            processDicNodeAsDigraph(traverseSession, &parentNode);
            continue;
        }
        if (isLookAheadCorrection) {
            // The algorithm maintains a small set of "deferred" nodes that have not consumed the
            // latest touch point yet. These are needed to apply look-ahead correction operations
            // that require special handling of the latest touch point. For example, with
            // insertions (e.g., "thiis" -> "this") the latest touch point should not be consumed
            // at all.
            processDicNodeAsTransposition(traverseSession, &parentNode);
            processDicNodeAsInsertion(traverseSession, &parentNode);
            continue;
        }

        // From here on: !isLookAheadCorrection
        // Only consider typing error corrections if the normalized compound distance is
        // below a spatial distance threshold.
        // NOTE: the threshold may need to be updated if scoring model changes.
        // TODO: Remove. Do not prune node here.
        const bool allowsErrorCorrections = TRAVERSAL->allowsErrorCorrections(&parentNode);
        // Process for handling space substitution (e.g., hevis => he is)
        if (TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &parentNode)) {
            createNextWordDicNode(traverseSession, &parentNode, true /* spaceSubstitution */);
        }

        DicNodeUtils::getAllChildDicNodes(
                &parentNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes);

        const int childCount = childDicNodes.getSizeAndLock();
        for (int childIndex = 0; childIndex < childCount; ++childIndex) {
            DicNode *const child = childDicNodes[childIndex];
            if (isCompletion) {
                // Handle forward lookahead when the lexicon letter exceeds the input size.
                processDicNodeAsMatch(traverseSession, child);
                continue;
            }
            const bool hasDigraph = DigraphUtils::hasDigraphForCodePoint(
                    traverseSession->getDictionaryStructurePolicy()->getHeaderStructurePolicy(),
                    child->getNodeCodePoint());
            if (hasDigraph) {
                correctionDicNode.initByCopy(child);
                correctionDicNode.advanceDigraphIndex();
                processDicNodeAsDigraph(traverseSession, &correctionDicNode);
            }
            if (TRAVERSAL->isOmission(traverseSession, &parentNode, child,
                    allowsErrorCorrections)) {
                // TODO: (Gesture) Change weight between omission and substitution errors
                // TODO: (Gesture) Terminal node should not be handled as omission
                correctionDicNode.initByCopy(child);
                processDicNodeAsOmission(traverseSession, &correctionDicNode);
            }
            const ProximityType proximityType = TRAVERSAL->getProximityType(
                    traverseSession, &parentNode, child);
            // TODO: Consider the difference of proximityType here
            if (proximityType == MATCH_CHAR || proximityType == PROXIMITY_CHAR) {
                processDicNodeAsMatch(traverseSession, child);
            } else if (proximityType == ADDITIONAL_PROXIMITY_CHAR) {
                if (allowsErrorCorrections) {
                    processDicNodeAsAdditionalProximityChar(traverseSession, &parentNode,
                            child);
                }
            } else if (proximityType == SUBSTITUTION_CHAR) {
                if (allowsErrorCorrections) {
                    processDicNodeAsSubstitution(traverseSession, &parentNode, child);
                }
            }
            // UNRELATED_CHAR and any other value: just drop this dicNode and do nothing.
        }

        // Push the dicNode for look-ahead correction
        if (allowsErrorCorrections && canDoLookAheadCorrection) {
            traverseSession->getDicTraverseCache()->copyPushNextActive(&parentNode);
        }
    }
}