bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, const bool isValidWord, const HistoricalInfo historicalInfo) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " "dictionary."); return false; } const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? false : isValidWord; int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { // The word is not in the dictionary. const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); return false; } if (!isValidWord) { return true; } wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); return false; } // Refresh word ids. 
ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } // Update entries for beginning of sentence. if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } } if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } return true; }
bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } if (!ngramContext->isValid()) { AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); return false; } if (wordCodePoints.size() > MAX_WORD_LENGTH) { AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", wordCodePoints.size()); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSerch */); if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { return false; } const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { mEntryCounters.decrementNgramCount( NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); return true; } else { return false; } }
bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); return false; } const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } const int ptNodePos = mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); return false; } if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { return false; } if (!ptNodeParams.representsNonWordInfo()) { mEntryCounters.decrementNgramCount(NgramType::Unigram); } return true; }
/**
 * Returns true when the concept's word class is WC_ABSTRACT or WC_ADJECTIVE, or when it is
 * WC_ADVERB and the word id is anything other than 910.
 */
bool TTconcept::checkWordId3() const {
	if (isWordClass(WC_ABSTRACT) || isWordClass(WC_ADJECTIVE)) {
		return true;
	}
	if (!isWordClass(WC_ADVERB)) {
		return false;
	}
	// Adverbs qualify unless they carry the one excluded word id.
	return getWordId() != 910;
}
/**
 * Builds the WordProperty of the given word: its unigram attributes, its shortcut targets
 * and every n-gram entry that the language model content exports for the word.
 *
 * @param wordCodePoints code points of the word to look up.
 * @return the assembled WordProperty, or a default-constructed one when the word has no id.
 */
const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
        const CodePointArrayView wordCodePoints) const {
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        AKLOGE("getWordProperty is called for invalid word.");
        return WordProperty();
    }
    const LanguageModelDictContent *const lmDictContent =
            mBuffers->getLanguageModelDictContent();

    // Collect n-gram entries. The scratch buffers below are reused for every entry.
    std::vector<NgramProperty> ngrams;
    int targetCodePoints[MAX_WORD_LENGTH];
    int contextCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
    int contextCodePointCounts[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    bool contextIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    for (const auto entry : lmDictContent->exportAllNgramEntriesRelatedToWord(
            mHeaderPolicy, wordId)) {
        const int targetLength = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
                MAX_WORD_LENGTH, targetCodePoints);
        const WordIdArrayView contextWordIds = entry.getPrevWordIds();
        for (size_t pos = 0; pos < contextWordIds.size(); ++pos) {
            int length = getCodePointsAndReturnCodePointCount(contextWordIds[pos],
                    MAX_WORD_LENGTH, contextCodePoints[pos]);
            const bool isBeginningOfSentence = lmDictContent->getProbabilityEntry(
                    contextWordIds[pos]).representsBeginningOfSentence();
            if (isBeginningOfSentence) {
                // Strip the marker so that only the word's own code points are reported.
                length = CharUtils::removeBeginningOfSentenceMarker(contextCodePoints[pos],
                        length);
            }
            contextCodePointCounts[pos] = length;
            contextIsBeginningOfSentence[pos] = isBeginningOfSentence;
        }
        const NgramContext ngramContext(contextCodePoints, contextCodePointCounts,
                contextIsBeginningOfSentence, contextWordIds.size());
        const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
        const HistoricalInfo *const ngramHistoricalInfo =
                ngramProbabilityEntry.getHistoricalInfo();
        // TODO: Output flags in WordAttributes.
        ngrams.emplace_back(ngramContext,
                CodePointArrayView(targetCodePoints, targetLength).toVector(),
                entry.getWordAttributes().getProbability(), *ngramHistoricalInfo);
    }

    // Collect shortcut targets, if the word has a shortcut list.
    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    int shortcutPos = getShortcutPositionOfWord(wordId);
    if (shortcutPos != NOT_A_DICT_POS) {
        int shortcutTarget[MAX_WORD_LENGTH];
        const ShortcutDictContent *const shortcutDictContent =
                mBuffers->getShortcutDictContent();
        bool hasNext = true;
        // At least one entry is read; the reader reports whether another one follows.
        do {
            int targetLength = 0;
            int probability = NOT_A_PROBABILITY;
            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH,
                    shortcutTarget, &targetLength, &probability, &hasNext, &shortcutPos);
            shortcuts.emplace_back(
                    CodePointArrayView(shortcutTarget, targetLength).toVector(), probability);
        } while (hasNext);
    }

    // Assemble the unigram part from the word attributes and the probability entry.
    const WordAttributes wordAttributes = lmDictContent->getWordAttributes(
            WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
    const ProbabilityEntry unigramProbabilityEntry = lmDictContent->getProbabilityEntry(wordId);
    const HistoricalInfo *const unigramHistoricalInfo =
            unigramProbabilityEntry.getHistoricalInfo();
    const UnigramProperty unigramProperty(
            unigramProbabilityEntry.representsBeginningOfSentence(),
            wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
            wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
            *unigramHistoricalInfo, std::move(shortcuts));
    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } const NgramContext *const ngramContext = ngramProperty->getNgramContext(); if (!ngramContext->isValid()) { AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); return false; } if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert the ngram to the dictionary. " "length: %zd", ngramProperty->getTargetCodePoints()->size()); return false; } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); if (prevWordIds.empty()) { return false; } for (size_t i = 0; i < prevWordIds.size(); ++i) { if (prevWordIds[i] != NOT_A_WORD_ID) { continue; } if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { return false; } const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, MAX_PROBABILITY /* probability */, HistoricalInfo()); if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); return false; } // Refresh word ids. 
ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { return false; } bool addedNewEntry = false; if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { if (addedNewEntry) { mEntryCounters.incrementNgramCount( NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); } return true; } else { return false; } }
bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); return false; } if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", mDictBuffer->getTailPosition()); return false; } if (wordCodePoints.size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert to the dictionary, length: %zd", wordCodePoints.size()); return false; } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", shortcut.getTargetCodePoints()->size()); return false; } } DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; int codePointsToAdd[MAX_WORD_LENGTH]; int codePointCountToAdd = wordCodePoints.size(); memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); if (unigramProperty->representsBeginningOfSentence()) { codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, codePointCountToAdd, MAX_WORD_LENGTH); } if (codePointCountToAdd <= 0) { return false; } const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { mEntryCounters.incrementNgramCount(NgramType::Unigram); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. 
const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { AKLOGE("Cannot find word id to add shortcut target."); return false; } const int wordPos = mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); for (const auto &shortcut : unigramProperty->getShortcuts()) { if (!mUpdatingHelper.addShortcutTarget(wordPos, CodePointArrayView(*shortcut.getTargetCodePoints()), shortcut.getProbability())) { AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), shortcut.getProbability()); return false; } } } return true; } else { return false; } }