// JNI entry point: fetches the WordProperty of |word| from the native dictionary referenced by
// |dict| and writes it into the given Java-side output arrays/objects.
// Does nothing if |dict| is 0, if |word| is longer than MAX_WORD_LENGTH, or if the
// beginning-of-sentence marker cannot be attached.
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
        jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
        jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
        jobject outBigramProbabilityInfo, jobject outShortcutTargets,
        jobject outShortcutProbabilities) {
    Dictionary *const nativeDict = reinterpret_cast<Dictionary *>(dict);
    if (!nativeDict) {
        return;
    }
    const jsize inputLength = env->GetArrayLength(word);
    if (inputLength > MAX_WORD_LENGTH) {
        AKLOGE("Invalid wordLength: %d", inputLength);
        return;
    }
    // Copy the Java code points into a native buffer that has room for the marker.
    int codePoints[MAX_WORD_LENGTH];
    env->GetIntArrayRegion(word, 0, inputLength, codePoints);
    int effectiveLength = inputLength;
    if (isBeginningOfSentence) {
        effectiveLength = CharUtils::attachBeginningOfSentenceMarker(
                codePoints, inputLength, MAX_WORD_LENGTH);
        if (effectiveLength < 0) {
            AKLOGE("Cannot attach Beginning-of-Sentence marker.");
            return;
        }
    }
    const WordProperty property = nativeDict->getWordProperty(codePoints, effectiveLength);
    property.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
            outBigramTargets, outBigramProbabilityInfo, outShortcutTargets,
            outShortcutProbabilities);
}
// Writes the dictionary header followed by the dictionary body to |filePath|.
//
// The data is first written to a temporary file whose name is |filePath| with
// TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE appended. The temporary file is renamed to |filePath|
// only after both buffers have been written and the stream has been closed successfully.
// On any failure the stream is closed, the temporary file is removed and false is returned.
//
// Returns true only if the temporary file was completely written and renamed to |filePath|.
/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath,
        BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) {
    const int tmpFileNameBufSize = strlen(filePath)
            + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */;
    // Name of a temporary file used for writing that is a connected string of original name and
    // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE.
    char tmpFileName[tmpFileNameBufSize];
    snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath,
            TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE);
    FILE *const file = fopen(tmpFileName, "wb");
    if (!file) {
        AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName);
        ASSERT(false);
        return false;
    }
    // Write the dictionary header.
    if (!writeBufferToFile(file, dictHeader)) {
        // Close the stream before removing the file so that the FILE object is not leaked.
        fclose(file);
        remove(tmpFileName);
        AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition());
        ASSERT(false);
        return false;
    }
    // Write the dictionary body.
    if (!writeBufferToFile(file, dictBody)) {
        fclose(file);
        remove(tmpFileName);
        AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition());
        ASSERT(false);
        return false;
    }
    // fclose() flushes buffered data; a failure here means the temporary file may be incomplete,
    // so it must not replace the existing dictionary.
    if (fclose(file) != 0) {
        remove(tmpFileName);
        AKLOGE("Dictionary file %s cannot be closed.", tmpFileName);
        ASSERT(false);
        return false;
    }
    if (rename(tmpFileName, filePath) != 0) {
        remove(tmpFileName);
        AKLOGE("Dictionary file %s cannot be renamed to %s.", tmpFileName, filePath);
        ASSERT(false);
        return false;
    }
    return true;
}
コード例 #3
0
// Copies the shortcut lists of all terminals in |terminalIdMap| from
// |originalShortcutDictContent| into this content. For each map entry, the key is used to look up
// the list in the original content and the mapped value is the id under which the new list
// position is registered in this content's lookup table.
// Returns false if a list cannot be copied or its position cannot be registered.
bool ShortcutDictContent::runGC(
        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const ShortcutDictContent *const originalShortcutDictContent) {
    for (const auto &idPair : *terminalIdMap) {
        const int srcListPos =
                originalShortcutDictContent->getShortcutListHeadPos(idPair.first);
        if (srcListPos == NOT_A_DICT_POS) {
            // Nothing to copy for this terminal.
            continue;
        }
        // The copied list is appended at the current tail of this content's buffer.
        const int dstListPos = getContentBuffer()->getTailPosition();
        const bool isCopied = copyShortcutListFromDictContent(srcListPos,
                originalShortcutDictContent, dstListPos);
        if (!isCopied) {
            AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d",
                    srcListPos, dstListPos);
            return false;
        }
        // Register the new list position in the lookup table.
        const bool isRegistered =
                getUpdatableAddressLookupTable()->set(idPair.second, dstListPos);
        if (!isRegistered) {
            AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d",
                    idPair.second, dstListPos);
            return false;
        }
    }
    return true;
}
// Creates a new file at |filePath| and writes the content of |buffer| to it.
//
// The file is opened with O_CREAT | O_EXCL, so the call fails if the file already exists. If the
// buffer cannot be written, or the stream cannot be closed (i.e. buffered data could not be
// flushed), the file is removed and false is returned.
//
// Returns true only if the whole buffer was written and the stream was closed successfully.
/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath,
        const BufferWithExtendableBuffer *const buffer) {
    const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
    if (fd == -1) {
        AKLOGE("File %s cannot be opened. errno: %d", filePath, errno);
        ASSERT(false);
        return false;
    }
    FILE *const file = fdopen(fd, "wb");
    if (!file) {
        AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno);
        // NOTE(review): fd is still open on this path and is never released. Closing it needs
        // close() from <unistd.h>; confirm that header is included by this file before adding
        // the call.
        ASSERT(false);
        return false;
    }
    if (!writeBufferToFile(file, buffer)) {
        fclose(file);
        remove(filePath);
        AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath,
                buffer->getTailPosition());
        ASSERT(false);
        return false;
    }
    // fclose() flushes buffered data, so its failure means that the file may be incomplete.
    if (fclose(file) != 0) {
        AKLOGE("File %s cannot be closed. errno: %d", filePath, errno);
        remove(filePath);
        ASSERT(false);
        return false;
    }
    return true;
}
// Reads the PtNode count stored at |ptNodeArrayPos| and reports it together with the position
// that follows the count field.
// Returns false, leaving the output parameters untouched, when |ptNodeArrayPos| is outside of the
// buffer or the count that was read is negative.
bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
        int *const outPtNodeCount, int *const outFirstPtNodePos) const {
    const bool isPosInBuffer =
            ptNodeArrayPos >= 0 && ptNodeArrayPos < mBuffer->getTailPosition();
    if (!isPosInBuffer) {
        // Reading invalid position because of a bug or a broken dictionary.
        AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
                ptNodeArrayPos, mBuffer->getTailPosition());
        ASSERT(false);
        return false;
    }
    const bool inAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos);
    const uint8_t *const rawBuffer = mBuffer->getBuffer(inAdditionalBuffer);
    // Positions in the additional buffer are shifted by the size of the original buffer.
    const int bufferBaseOffset = inAdditionalBuffer ? mBuffer->getOriginalBufferSize() : 0;
    int posInRawBuffer = ptNodeArrayPos - bufferBaseOffset;
    const int nodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
            rawBuffer, &posInRawBuffer);
    if (nodeCount < 0) {
        AKLOGE("Invalid PtNode count in an array: %d.", nodeCount);
        return false;
    }
    *outPtNodeCount = nodeCount;
    *outFirstPtNodePos = posInRawBuffer + bufferBaseOffset;
    return true;
}
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
    if (!toBeUpdatedPtNodeParams->isTerminal()) {
        AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
        return false;
    }
    const ProbabilityEntry originalProbabilityEntry =
            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
                    toBeUpdatedPtNodeParams->getTerminalId());
    if (originalProbabilityEntry.hasHistoricalInfo()) {
        const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
                originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
        const ProbabilityEntry probabilityEntry =
                originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
        if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
                toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
            AKLOGE("Cannot write updated probability entry. terminalId: %d",
                    toBeUpdatedPtNodeParams->getTerminalId());
            return false;
        }
        const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
        if (!isValid) {
            if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
                AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
                return false;
            }
        }
        *outNeedsToKeepPtNode = isValid;
    } else {
        // No need to update probability.
        *outNeedsToKeepPtNode = true;
    }
    return true;
}
bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry,
        const int entryPos) {
    BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer();
    int writingPos = entryPos;
    if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(),
            Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) {
        AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos);
        return false;
    }
    if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(),
            Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) {
        AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos);
        return false;
    }
    if (mHasHistoricalInfo) {
        const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo();
        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
                Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) {
            AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos);
            return false;
        }
        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(),
                Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) {
            AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos);
            return false;
        }
        if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(),
                Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) {
            AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos);
            return false;
        }
    }
    return true;
}
// Maps the |size| bytes at |bufOffset| of the file at |path| read-only and creates a structure
// policy for it. Only format version 202 yields a policy; every other detected version is logged,
// triggers ASSERT(false) and results in nullptr. nullptr is also returned, without assertion,
// when the buffer cannot be opened.
/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
        DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict(
                const char *const path, const int bufOffset, const int size) {
    // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
    // MmappedBufferPtr if the instance has the responsibility.
    MmappedBuffer::MmappedBufferPtr fileBuffer(
            MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */));
    if (!fileBuffer) {
        return nullptr;
    }
    const auto detectedVersion =
            FormatUtils::detectFormatVersion(fileBuffer->getReadOnlyByteArrayView());
    if (detectedVersion == FormatUtils::VERSION_202) {
        // The only format that can be served from a single file.
        return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
                new PatriciaTriePolicy(std::move(fileBuffer)));
    }
    // Everything below is a failure; only the log message depends on the version.
    switch (detectedVersion) {
        case FormatUtils::VERSION_2:
        case FormatUtils::VERSION_201:
            AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
            break;
        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
        case FormatUtils::VERSION_402:
        case FormatUtils::VERSION_403:
            AKLOGE("Given path is a file but the format is version 4. path: %s", path);
            break;
        default:
            AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
            break;
    }
    ASSERT(false);
    return nullptr;
}
コード例 #9
0
// Copies, with garbage collection, the bigram lists of all terminals in |terminalIdMap| from
// |originalBigramDictContent| into this content. For each map entry, the key is used to look up
// the list in the original content and the mapped value is the id under which the new list
// position is registered. The number of copied entries is added to |*outBigramEntryCount|
// (the counter is accumulated, not reset, by this method).
// Returns false if a list cannot be processed or its position cannot be registered.
bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const BigramDictContent *const originalBigramDictContent,
        int *const outBigramEntryCount) {
    for (const auto &idPair : *terminalIdMap) {
        const int srcListPos = originalBigramDictContent->getBigramListHeadPos(idPair.first);
        if (srcListPos == NOT_A_DICT_POS) {
            // This terminal does not have a bigram list.
            continue;
        }
        // The new list is appended at the current tail of this content's buffer.
        const int dstListPos = getContentBuffer()->getTailPosition();
        int copiedEntryCount = 0;
        const bool isCollected = runGCBigramList(srcListPos, originalBigramDictContent,
                dstListPos, terminalIdMap, &copiedEntryCount);
        if (!isCollected) {
            AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d",
                    srcListPos, dstListPos);
            return false;
        }
        if (copiedEntryCount == 0) {
            // All bigram entries are useless. This terminal does not have a bigram list.
            continue;
        }
        *outBigramEntryCount += copiedEntryCount;
        // Register the new list position in the lookup table.
        const bool isRegistered =
                getUpdatableAddressLookupTable()->set(idPair.second, dstListPos);
        if (!isRegistered) {
            AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d",
                    idPair.second, dstListPos);
            return false;
        }
    }
    return true;
}
コード例 #10
0
// Opens the body file that belongs to |dictPath| and splits it into its content buffers.
//
// The body file is parsed as a sequence of records, each consisting of a 32-bit size field
// followed by that many bytes of content. Every size is validated against the remaining file
// size before the corresponding sub view is created, so a truncated or corrupted file is
// rejected without creating a view that points outside of the mapped body.
//
// Returns a null Ver4DictBuffersPtr when |headerBuffer| is null, the body file cannot be opened,
// the body file is corrupted, or it does not contain exactly
// Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE buffers.
/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
        const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer,
        const FormatUtils::FORMAT_VERSION formatVersion) {
    if (!headerBuffer) {
        // Log before asserting so that the reason is reported even if ASSERT aborts.
        AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
        ASSERT(false);
        return Ver4DictBuffersPtr(nullptr);
    }
    // TODO: take only dictDirPath, and open both header and trie files in the constructor below
    const bool isUpdatable = headerBuffer->isUpdatable();
    MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath,
            Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable);
    if (!bodyBuffer) {
        return Ver4DictBuffersPtr(nullptr);
    }
    std::vector<ReadWriteByteArrayView> buffers;
    const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView();
    const int bodySize = static_cast<int>(buffer.size());
    // Number of bytes consumed by ByteArrayUtils::readUint32AndAdvancePosition().
    const int sizeFieldSize = 4;
    int position = 0;
    while (position < bodySize) {
        if (bodySize - position < sizeFieldSize) {
            // The size field itself is truncated; reading it would run past the end of the file.
            AKLOGE("The dict body file is corrupted.");
            return Ver4DictBuffersPtr(nullptr);
        }
        const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition(
                buffer.data(), &position);
        // Compare against the remaining size instead of computing position + bufferSize, which
        // could overflow for a corrupted size field.
        if (bufferSize < 0 || position < 0 || position > bodySize
                || bufferSize > bodySize - position) {
            AKLOGE("The dict body file is corrupted.");
            return Ver4DictBuffersPtr(nullptr);
        }
        buffers.push_back(buffer.subView(position, bufferSize));
        position += bufferSize;
    }
    if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) {
        AKLOGE("The dict body file is corrupted.");
        return Ver4DictBuffersPtr(nullptr);
    }
    return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer),
            formatVersion, buffers));
}
// Maps |bufferSize| bytes starting at |bufferOffset| of the file at |path| into memory.
//
// mmap() is called with an offset rounded down to a page boundary; the difference is added to
// the mapped length and to the returned data pointer. The mapping is MAP_PRIVATE and is
// writable only when |isUpdatable| is true. On success the file descriptor and the mapping are
// handed to the MmappedBuffer constructor.
//
// Returns nullptr when the range is negative, the file cannot be opened, the page size cannot
// be determined, or the mapping fails. All resources acquired so far are released in that case.
/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer(
        const char *const path, const int bufferOffset, const int bufferSize,
        const bool isUpdatable) {
    if (bufferOffset < 0 || bufferSize < 0) {
        // A negative offset would make the data pointer point before the mapped region.
        AKLOGE("DICT: Invalid buffer range. offset=%d size=%d", bufferOffset, bufferSize);
        return nullptr;
    }
    const int mmapFd = open(path, O_RDONLY);
    if (mmapFd < 0) {
        AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno);
        return nullptr;
    }
    const int pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize <= 0) {
        // Guard the modulo below against a failed sysconf() call.
        AKLOGE("DICT: Can't get the page size. errno=%d", errno);
        close(mmapFd);
        return nullptr;
    }
    const int offset = bufferOffset % pagesize;
    int alignedOffset = bufferOffset - offset;
    int alignedSize = bufferSize + offset;
    const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ;
    void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd,
            alignedOffset);
    if (mmappedBuffer == MAP_FAILED) {
        AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
        close(mmapFd);
        return nullptr;
    }
    uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset;
    if (!buffer) {
        AKLOGE("DICT: buffer is null");
        // The mapping has been established at this point, so release it as well as the fd.
        munmap(mmappedBuffer, alignedSize);
        close(mmapFd);
        return nullptr;
    }
    return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize,
            mmapFd, isUpdatable));
}
// Updates the language model entries for |wordCodePoints| in the given n-gram context.
//
// Steps, in order:
//  1. Fails if the dictionary is not updatable.
//  2. If the word has no word id yet, adds a unigram entry for it. When |isValidWord| is false
//     the method returns true right after adding that entry.
//  3. If the first previous word is a beginning-of-sentence, makes sure a beginning-of-sentence
//     unigram exists and updates the entries for it.
//  4. Updates the entries for the word itself with the previous word ids.
//
// Returns false if the dictionary is not updatable or if any add/update step fails.
bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
        const bool isValidWord, const HistoricalInfo historicalInfo) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
                "dictionary.");
        return false;
    }
    // The word is never updated as a valid word when it directly follows a beginning-of-sentence.
    const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
            false : isValidWord;
    int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        // The word is not in the dictionary.
        // Add it with no probability and with historical info that carries only the timestamp.
        const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
                false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
                NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
                0 /* count */));
        if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
            AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
            return false;
        }
        if (!isValidWord) {
            // Nothing else is updated for a word that is not valid.
            return true;
        }
        // Look the id up again now that the entry exists.
        wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    }

    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
            false /* tryLowerCaseSearch */);
    if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
        if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
            // The beginning-of-sentence entry does not exist yet; create it.
            // NOTE(review): this construction passes one flag argument fewer than the
            // UnigramProperty construction above (no isBlacklisted argument) - confirm that the
            // intended constructor overload is selected.
            const UnigramProperty beginningOfSentenceUnigramProperty(
                    true /* representsBeginningOfSentence */,
                    true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
                    HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
            if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
                    &beginningOfSentenceUnigramProperty)) {
                AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
                return false;
            }
            // Refresh word ids.
            // NOTE(review): the returned view is discarded; presumably prevWordIds refers to
            // prevWordIdArray and therefore observes the refreshed ids - confirm.
            ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
        }
        // Update entries for beginning of sentence.
        if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
                prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo,
                mHeaderPolicy, &mEntryCounters)) {
            return false;
        }
    }
    // Update the entries for the word itself in the context of all previous words.
    if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
            wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
        return false;
    }
    return true;
}
// Unmaps the buffer and closes its file descriptor. Instances whose aligned size is 0 hold
// nothing to release. Failures are only logged.
MmappedBuffer::~MmappedBuffer() {
    if (mAlignedSize != 0) {
        const int unmapResult = munmap(mMmappedBuffer, mAlignedSize);
        if (unmapResult != 0) {
            AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", unmapResult, errno);
        }
        const int closeResult = close(mMmapFd);
        if (closeResult != 0) {
            AKLOGE("DICT: Failure in close. ret=%d errno=%d", closeResult, errno);
        }
    }
}
// Releases a dictionary buffer. With USE_MMAP_FOR_DICTIONARY the buffer is unmapped and |fd| is
// closed (failures are only logged); otherwise the buffer is freed and |length| and |fd| are
// not used.
static void releaseDictBuf(const void *dictBuf, const size_t length, const int fd) {
#ifdef USE_MMAP_FOR_DICTIONARY
    const int unmapResult = munmap(const_cast<void *>(dictBuf), length);
    if (unmapResult != 0) {
        AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", unmapResult, errno);
    }
    const int closeResult = close(fd);
    if (closeResult != 0) {
        AKLOGE("DICT: Failure in close. ret=%d errno=%d", closeResult, errno);
    }
#else // USE_MMAP_FOR_DICTIONARY
    free(const_cast<void *>(dictBuf));
#endif // USE_MMAP_FOR_DICTIONARY
}
// Reads the forward link field at the current reading position. When the link is valid, moves to
// the linked node array and starts reading it; otherwise marks the end of reading by setting the
// position to NOT_A_DICT_POS. An out-of-range position sets the error flag.
void DynamicPatriciaTrieReadingHelper::followForwardLink() {
    const int linkFieldPos = mReadingState.mPos;
    const bool isPosInBuffer = linkFieldPos >= 0 && linkFieldPos < mBuffer->getTailPosition();
    if (!isPosInBuffer) {
        // Reading invalid position because of bug or broken dictionary.
        AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
                mReadingState.mPos, mBuffer->getTailPosition());
        ASSERT(false);
        mIsError = true;
        mReadingState.mPos = NOT_A_DICT_POS;
        return;
    }
    const bool inAdditionalBuffer = mBuffer->isInAdditionalBuffer(linkFieldPos);
    const uint8_t *const rawBuffer = mBuffer->getBuffer(inAdditionalBuffer);
    // Positions in the additional buffer are shifted by the size of the original buffer.
    const int posInRawBuffer = inAdditionalBuffer
            ? linkFieldPos - mBuffer->getOriginalBufferSize() : linkFieldPos;
    const int forwardLinkPosition =
            DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(rawBuffer, posInRawBuffer);
    mReadingState.mPosOfLastForwardLinkField = linkFieldPos;
    if (!DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) {
        // All node arrays have been read.
        mReadingState.mPos = NOT_A_DICT_POS;
        return;
    }
    // Follow the forward link; it is applied relative to the position of the link field.
    mReadingState.mPos = linkFieldPos + forwardLinkPosition;
    nextPtNodeArray();
}
コード例 #16
0
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0,
        const int *const word1, const int length1) {
    if (!mBuffer->isUpdatable()) {
        AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary.");
        return false;
    }
    if (mBufferWithExtendableBuffer.getTailPosition()
            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
        AKLOGE("The dictionary is too large to dynamically update.");
        return false;
    }
    const int word0Pos = getTerminalNodePositionOfWord(word0, length0,
            false /* forceLowerCaseSearch */);
    if (word0Pos == NOT_A_DICT_POS) {
        return false;
    }
    const int word1Pos = getTerminalNodePositionOfWord(word1, length1,
            false /* forceLowerCaseSearch */);
    if (word1Pos == NOT_A_DICT_POS) {
        return false;
    }
    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
            &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
    if (writingHelper.removeBigramWords(word0Pos, word1Pos)) {
        mBigramCount--;
        return true;
    } else {
        return false;
    }
}
// Creates a structure policy for an on-memory dictionary of the given format version.
// Version 402 uses the backward::v402 classes; version 403 and the testing-only version 4 use
// the current classes. Any other version is logged and results in nullptr.
/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
        DictionaryStructureWithBufferPolicyFactory:: newPolicyForOnMemoryDict(
                const int formatVersion, const std::vector<int> &locale,
                const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
    const FormatUtils::FORMAT_VERSION resolvedVersion =
            FormatUtils::getFormatVersion(formatVersion);
    if (resolvedVersion == FormatUtils::VERSION_402) {
        return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
                backward::v402::Ver4DictBuffers,
                backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
                backward::v402::Ver4PatriciaTriePolicy>(
                        resolvedVersion, locale, attributeMap);
    }
    const bool isCurrentVer4 = resolvedVersion == FormatUtils::VERSION_4_ONLY_FOR_TESTING
            || resolvedVersion == FormatUtils::VERSION_403;
    if (isCurrentVer4) {
        return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
                Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
                        resolvedVersion, locale, attributeMap);
    }
    AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary",
            formatVersion);
    return nullptr;
}
コード例 #18
0
void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount,
        int *const outCodePoint, int *const outCodePointCount, int *const outProbability,
        bool *const outhasNext, int *const shortcutEntryPos) const {
    const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer();
    if (*shortcutEntryPos < 0 || *shortcutEntryPos >=  shortcutListBuffer->getTailPosition()) {
        AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d",
                *shortcutEntryPos, shortcutListBuffer->getTailPosition());
        ASSERT(false);
        if (outhasNext) {
            *outhasNext = false;
        }
        if (outCodePointCount) {
            *outCodePointCount = 0;
        }
        return;
    }

    const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition(
            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos);
    if (outProbability) {
        *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK;
    }
    if (outhasNext) {
        *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK;
    }
    if (outCodePoint && outCodePointCount) {
        shortcutListBuffer->readCodePointsAndAdvancePosition(
                maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos);
    }
}
コード例 #19
0
bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length,
        const int probability) {
    if (!mBuffer->isUpdatable()) {
        AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary.");
        return false;
    }
    if (mBufferWithExtendableBuffer.getTailPosition()
            >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
        AKLOGE("The dictionary is too large to dynamically update.");
        return false;
    }
    DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer,
            getBigramsStructurePolicy(), getShortcutsStructurePolicy());
    readingHelper.initWithPtNodeArrayPos(getRootPosition());
    DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer,
            &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict());
    bool addedNewUnigram = false;
    if (writingHelper.addUnigramWord(&readingHelper, word, length, probability,
            &addedNewUnigram)) {
        if (addedNewUnigram) {
            mUnigramCount++;
        }
        return true;
    } else {
        return false;
    }
}
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
        const PtNodeParams *const toBeUpdatedPtNodeParams) {
    int pos = toBeUpdatedPtNodeParams->getHeadPos();
    const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
    const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
    if (usesAdditionalBuffer) {
        pos -= mTrieBuffer->getOriginalBufferSize();
    }
    // Read original flags
    const PatriciaTrieReadingUtils::NodeFlags originalFlags =
            PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
    const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
            DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
                    false /* isDeleted */, true /* willBecomeNonTerminal */);
    if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
            toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
        AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
                toBeUpdatedPtNodeParams->getTerminalId());
        return false;
    }
    // Update flags.
    int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
    return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
            &writingPos);
}
// Walks all entries in the trie map level |bitmapEntryIndex| and, recursively, in their next
// level maps, while removing or updating entries for GC.
//
// For each entry:
//  - it is removed when |prevWordCount| > 0, the entry is valid and the root entry for its key
//    is not valid;
//  - with historical info, it is removed when
//    DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC() says so, or when
//    |needsToHalveCounters| is set and half of its count is 0; otherwise, when
//    |needsToHalveCounters| is set, it is rewritten with the halved count;
//  - every entry that is kept increments the n-gram counter for |prevWordCount| + 1 words in
//    |outEntryCounters|.
//
// Returns false when |prevWordCount| exceeds MAX_PREV_WORD_COUNT_FOR_N_GRAM (checked per entry,
// so an empty level returns true), or when a trie map operation or a nested call fails.
bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
        const int prevWordCount, const HeaderPolicy *const headerPolicy,
        const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) {
    for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
        if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
            AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
                    prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
            return false;
        }
        const ProbabilityEntry probabilityEntry =
                ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
        if (prevWordCount > 0 && probabilityEntry.isValid()
                && !mTrieMap.getRoot(entry.key()).mIsValid) {
            // The entry is related to a word that has been removed. Remove the entry.
            if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
                return false;
            }
            continue;
        }
        if (mHasHistoricalInfo && probabilityEntry.isValid()) {
            const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo();
            if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC(
                    *originalHistoricalInfo)) {
                // Remove the entry.
                if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
                    return false;
                }
                continue;
            }
            if (needsToHalveCounters) {
                const int updatedCount = originalHistoricalInfo->getCount() / 2;
                if (updatedCount == 0) {
                    // Remove the entry.
                    if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
                        return false;
                    }
                    continue;
                }
                // Keep timestamp and level; only the count is halved.
                const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(),
                        originalHistoricalInfo->getLevel(), updatedCount);
                const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(),
                        &historicalInfoToSave);
                if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
                        bitmapEntryIndex)) {
                    return false;
                }
            }
        }
        // The entry survives GC; count it as an n-gram of (prevWordCount + 1) words.
        outEntryCounters->incrementNgramCount(
                NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1));
        if (!entry.hasNextLevelMap()) {
            continue;
        }
        // Process the entries that have this entry as an additional previous word.
        if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
                prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) {
            return false;
        }
    }
    return true;
}
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
        int *const outCodePointCount) {
    *outCodePointCount = 0;
    if (token == 0) {
        mTerminalPtNodePositionsForIteratingWords.clear();
        DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
                &mTerminalPtNodePositionsForIteratingWords);
        DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
        readingHelper.initWithPtNodeArrayPos(getRootPosition());
        readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
    }
    const int terminalPtNodePositionsVectorSize =
            static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
    if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
        AKLOGE("Given token %d is invalid.", token);
        return 0;
    }
    const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
    const PtNodeParams ptNodeParams =
            mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
    *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
            MAX_WORD_LENGTH, outCodePoints);
    const int nextToken = token + 1;
    if (nextToken >= terminalPtNodePositionsVectorSize) {
        // All words have been iterated.
        mTerminalPtNodePositionsForIteratingWords.clear();
        return 0;
    }
    return nextToken;
}
// JNI entry point that opens a dictionary from the path given in sourceDir.
//
// The path is copied out of the Java string into a local, explicitly terminated char buffer and
// handed to the structure-policy factory together with dictOffset, dictSize (both narrowed to
// int) and the updatable flag.
//
// Returns a newly allocated Dictionary cast to jlong, or 0 when the path string is empty or the
// factory does not produce a structure policy.
static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring sourceDir,
        jlong dictOffset, jlong dictSize, jboolean isUpdatable) {
    PROF_OPEN;
    PROF_START(66);
    const jsize pathUtf8Length = env->GetStringUTFLength(sourceDir);
    if (pathUtf8Length <= 0) {
        AKLOGE("DICT: Can't get sourceDir string");
        return 0;
    }
    // One extra byte for the terminator that is written explicitly below.
    char pathChars[pathUtf8Length + 1];
    // The region length is expressed in Java chars, hence GetStringLength rather than the
    // UTF-8 byte length used for sizing the buffer.
    const jsize pathCharCount = env->GetStringLength(sourceDir);
    env->GetStringUTFRegion(sourceDir, 0, pathCharCount, pathChars);
    pathChars[pathUtf8Length] = '\0';

    const int offsetInFile = static_cast<int>(dictOffset);
    const int sizeInFile = static_cast<int>(dictSize);
    const bool updatable = (isUpdatable == JNI_TRUE);
    DictionaryStructureWithBufferPolicy *const structurePolicy =
            DictionaryStructureWithBufferPolicyFactory::newDictionaryStructureWithBufferPolicy(
                    pathChars, offsetInFile, sizeInFile, updatable);
    if (!structurePolicy) {
        return 0;
    }

    Dictionary *const openedDictionary = new Dictionary(env, structurePolicy);
    PROF_END(66);
    PROF_CLOSE;
    return reinterpret_cast<jlong>(openedDictionary);
}
// Method to iterate all words in the dictionary for makedict.
// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when
// the dictionary does not have a next word.
//
// outCodePoints must have exactly MAX_WORD_LENGTH elements; otherwise an error is logged and 0
// is returned. 0 is also returned when dict is a null handle. On the normal path the word's
// code points are copied to outCodePoints (without a terminator) and element 0 of
// outIsBeginningOfSentence is set to whether the word starts with
// CODE_POINT_BEGINNING_OF_SENTENCE.
static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
        jlong dict, jint token, jintArray outCodePoints, jbooleanArray outIsBeginningOfSentence) {
    Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
    if (!dictionary) return 0;
    const jsize codePointBufSize = env->GetArrayLength(outCodePoints);
    if (codePointBufSize != MAX_WORD_LENGTH) {
        AKLOGE("Invalid outCodePointsLength: %d", codePointBufSize);
        ASSERT(false);
        return 0;
    }
    // The size check above guarantees the Java buffer holds MAX_WORD_LENGTH elements, so a
    // fixed-size array is used instead of a (non-standard in C++) variable-length array.
    int wordCodePoints[MAX_WORD_LENGTH];
    int wordCodePointCount = 0;
    const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints,
            &wordCodePointCount);
    JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
            MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount,
            false /* needsNullTermination */);
    // wordCodePoints is only read when the dictionary reported at least one code point.
    const bool isBeginningOfSentence = wordCodePointCount > 0
            && wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE;
    JniDataUtils::putBooleanToArray(env, outIsBeginningOfSentence, 0 /* index */,
            isBeginningOfSentence);
    return nextToken;
}
// Traversal callback invoked for each visited PtNode.
//
// Non-terminal PtNodes are left untouched. For a terminal PtNode, its current terminal id is
// looked up in mTerminalIdMap, the PtNode is rewritten with the mapped id, and then its
// has-bigrams / has-shortcut-targets flags are updated.
//
// Returns false when the terminal id is missing from the map, when the id cannot be rewritten,
// or when the flag update fails; true otherwise.
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
    if (!ptNodeParams->isTerminal()) {
        return true;
    }
    TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
            mTerminalIdMap->find(ptNodeParams->getTerminalId());
    if (it == mTerminalIdMap->end()) {
        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
        return false;
    }
    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
        // Report the failure instead of carrying on: continuing would let the traversal
        // succeed while this PtNode still carries its old terminal id.
        return false;
    }
    return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
}
// Code example #26
// 0
bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint,
        const int codePointCount, const int probability, const bool hasNext,
        int *const shortcutEntryPos) {
    BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer();
    const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext);
    if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags,
            Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) {
        AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos);
        return false;
    }
    if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount,
            true /* writesTerminator */, shortcutEntryPos)) {
        AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos);
        return false;
    }
    return true;
}
// Adds an n-gram entry by delegating to the bigram policy.
//
// Only the first element of prevWordIds is used as the previous word. outAddedNewBigram is
// forwarded to the policy unchanged. Returns true on success; logs and returns false when the
// policy rejects the entry.
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
        const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
    const int prevWordId = prevWordIds[0];
    if (mBigramPolicy->addNewEntry(prevWordId, wordId, bigramProperty, outAddedNewBigram)) {
        return true;
    }
    AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
            prevWordId, wordId);
    return false;
}
// Registers a shortcut target for the terminal described by ptNodeParams.
//
// The terminal id of the PtNode, the target code points and the shortcut probability are handed
// to the shortcut policy. Returns true on success; logs and returns false when the policy
// cannot add the shortcut.
bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
        const int *const targetCodePoints, const int targetCodePointCount,
        const int shortcutProbability) {
    const bool added = mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
            targetCodePoints, targetCodePointCount, shortcutProbability);
    if (added) {
        return true;
    }
    AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
    return false;
}
// Code example #29
// 0
// File: dictionary.cpp  Project: freiling/mojo
// Adds a unigram entry for the given word to the underlying dictionary structure.
//
// A unigram that represents Beginning-of-Sentence is rejected (logged, returns false) when the
// dictionary header reports that Beginning-of-Sentence is not supported. Otherwise the current
// time is refreshed via TimeKeeper and the request is forwarded to the structure policy, whose
// result is returned.
bool Dictionary::addUnigramEntry(const int *const word, const int length,
        const UnigramProperty *const unigramProperty) {
    if (unigramProperty->representsBeginningOfSentence()) {
        // The header is only consulted for Beginning-of-Sentence entries.
        const bool headerSupportsBos = mDictionaryStructureWithBufferPolicy
                ->getHeaderStructurePolicy()->supportsBeginningOfSentence();
        if (!headerSupportsBos) {
            AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
            return false;
        }
    }
    TimeKeeper::setCurrentTime();
    return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
}
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
    if (!toBeUpdatedPtNodeParams->isTerminal()) {
        AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
        return false;
    }
    const ProbabilityEntry originalProbabilityEntry =
            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
                    toBeUpdatedPtNodeParams->getTerminalId());
    if (originalProbabilityEntry.isValid()) {
        *outNeedsToKeepPtNode = true;
        return true;
    }
    if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
        AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
        return false;
    }
    *outNeedsToKeepPtNode = false;
    return true;
}