bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
        const bool isValidWord, const HistoricalInfo historicalInfo) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
                "dictionary.");
        return false;
    }
    const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
            false : isValidWord;
    int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        // The word is not in the dictionary.
        const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
                false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
                NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
                0 /* count */));
        if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
            AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
            return false;
        }
        if (!isValidWord) {
            return true;
        }
        wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    }

    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
            false /* tryLowerCaseSearch */);
    if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
        if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
            const UnigramProperty beginningOfSentenceUnigramProperty(
                    true /* representsBeginningOfSentence */,
                    true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
                    HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
            if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
                    &beginningOfSentenceUnigramProperty)) {
                AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
                return false;
            }
            // Refresh word ids.
            ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
        }
        // Update entries for beginning of sentence.
        if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
                prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo,
                mHeaderPolicy, &mEntryCounters)) {
            return false;
        }
    }
    if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
            wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
        return false;
    }
    return true;
}
bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
        const CodePointArrayView wordCodePoints) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
        return false;
    }
    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
                mDictBuffer->getTailPosition());
        return false;
    }
    if (!ngramContext->isValid()) {
        AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
        return false;
    }
    if (wordCodePoints.size() > MAX_WORD_LENGTH) {
        AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
                wordCodePoints.size());
    }
    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
            false /* tryLowerCaseSerch */);
    if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
        return false;
    }
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        return false;
    }
    if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
        mEntryCounters.decrementNgramCount(
                NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
        return true;
    } else {
        return false;
    }
}
bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
        return false;
    }
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        return false;
    }
    const int ptNodePos =
            mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
    const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
    if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
        AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
        return false;
    }
    if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) {
        return false;
    }
    if (!ptNodeParams.representsNonWordInfo()) {
        mEntryCounters.decrementNgramCount(NgramType::Unigram);
    }
    return true;
}
コード例 #4
0
ファイル: tt_concept.cpp プロジェクト: OmerMor/scummvm
bool TTconcept::checkWordId3() const {
	return isWordClass(WC_ABSTRACT) || isWordClass(WC_ADJECTIVE) ||
		(isWordClass(WC_ADVERB) && getWordId() != 910);
}
const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
        const CodePointArrayView wordCodePoints) const {
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        AKLOGE("getWordProperty is called for invalid word.");
        return WordProperty();
    }
    const LanguageModelDictContent *const languageModelDictContent =
            mBuffers->getLanguageModelDictContent();
    // Fetch ngram information.
    std::vector<NgramProperty> ngrams;
    int ngramTargetCodePoints[MAX_WORD_LENGTH];
    int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
    int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
            mHeaderPolicy, wordId)) {
        const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
                MAX_WORD_LENGTH, ngramTargetCodePoints);
        const WordIdArrayView prevWordIds = entry.getPrevWordIds();
        for (size_t i = 0; i < prevWordIds.size(); ++i) {
            ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
                       MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
            ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
                    prevWordIds[i]).representsBeginningOfSentence();
            if (ngramPrevWordIsBeginningOfSentense[i]) {
                ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
                        ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
            }
        }
        const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
                ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
        const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
        const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
        // TODO: Output flags in WordAttributes.
        ngrams.emplace_back(ngramContext,
                CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
                entry.getWordAttributes().getProbability(), *historicalInfo);
    }
    // Fetch shortcut information.
    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    int shortcutPos = getShortcutPositionOfWord(wordId);
    if (shortcutPos != NOT_A_DICT_POS) {
        int shortcutTarget[MAX_WORD_LENGTH];
        const ShortcutDictContent *const shortcutDictContent =
                mBuffers->getShortcutDictContent();
        bool hasNext = true;
        while (hasNext) {
            int shortcutTargetLength = 0;
            int shortcutProbability = NOT_A_PROBABILITY;
            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
            shortcuts.emplace_back(
                    CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
                    shortcutProbability);
        }
    }
    const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
            WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
    const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
    const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
            wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
            wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
            *historicalInfo, std::move(shortcuts));
    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
        return false;
    }
    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
                mDictBuffer->getTailPosition());
        return false;
    }
    const NgramContext *const ngramContext = ngramProperty->getNgramContext();
    if (!ngramContext->isValid()) {
        AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
        return false;
    }
    if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
        AKLOGE("The word is too long to insert the ngram to the dictionary. "
                "length: %zd", ngramProperty->getTargetCodePoints()->size());
        return false;
    }
    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
            false /* tryLowerCaseSearch */);
    if (prevWordIds.empty()) {
        return false;
    }
    for (size_t i = 0; i < prevWordIds.size(); ++i) {
        if (prevWordIds[i] != NOT_A_WORD_ID) {
            continue;
        }
        if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
            return false;
        }
        const UnigramProperty beginningOfSentenceUnigramProperty(
                true /* representsBeginningOfSentence */, true /* isNotAWord */,
                false /* isBlacklisted */, false /* isPossiblyOffensive */,
                MAX_PROBABILITY /* probability */, HistoricalInfo());
        if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
                &beginningOfSentenceUnigramProperty)) {
            AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
            return false;
        }
        // Refresh word ids.
        ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
    }
    const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
            false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        return false;
    }
    bool addedNewEntry = false;
    if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
        if (addedNewEntry) {
            mEntryCounters.incrementNgramCount(
                    NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
        }
        return true;
    } else {
        return false;
    }
}
bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
        const UnigramProperty *const unigramProperty) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
        return false;
    }
    if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
        AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
                mDictBuffer->getTailPosition());
        return false;
    }
    if (wordCodePoints.size() > MAX_WORD_LENGTH) {
        AKLOGE("The word is too long to insert to the dictionary, length: %zd",
                wordCodePoints.size());
        return false;
    }
    for (const auto &shortcut : unigramProperty->getShortcuts()) {
        if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
            AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
                    shortcut.getTargetCodePoints()->size());
            return false;
        }
    }
    DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
    readingHelper.initWithPtNodeArrayPos(getRootPosition());
    bool addedNewUnigram = false;
    int codePointsToAdd[MAX_WORD_LENGTH];
    int codePointCountToAdd = wordCodePoints.size();
    memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
    if (unigramProperty->representsBeginningOfSentence()) {
        codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
                codePointCountToAdd, MAX_WORD_LENGTH);
    }
    if (codePointCountToAdd <= 0) {
        return false;
    }
    const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
    if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
            &addedNewUnigram)) {
        if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
            mEntryCounters.incrementNgramCount(NgramType::Unigram);
        }
        if (unigramProperty->getShortcuts().size() > 0) {
            // Add shortcut target.
            const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */);
            if (wordId == NOT_A_WORD_ID) {
                AKLOGE("Cannot find word id to add shortcut target.");
                return false;
            }
            const int wordPos =
                    mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
            for (const auto &shortcut : unigramProperty->getShortcuts()) {
                if (!mUpdatingHelper.addShortcutTarget(wordPos,
                        CodePointArrayView(*shortcut.getTargetCodePoints()),
                        shortcut.getProbability())) {
                    AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
                            "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
                            shortcut.getProbability());
                    return false;
                }
            }
        }
        return true;
    } else {
        return false;
    }
}