static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints, jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo, jobject outShortcutTargets, jobject outShortcutProbabilities) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; const jsize wordLength = env->GetArrayLength(word); if (wordLength > MAX_WORD_LENGTH) { AKLOGE("Invalid wordLength: %d", wordLength); return; } int wordCodePoints[MAX_WORD_LENGTH]; env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); int codePointCount = wordLength; if (isBeginningOfSentence) { codePointCount = CharUtils::attachBeginningOfSentenceMarker( wordCodePoints, wordLength, MAX_WORD_LENGTH); if (codePointCount < 0) { AKLOGE("Cannot attach Beginning-of-Sentence marker."); return; } } const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, codePointCount); wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, outShortcutProbabilities); }
/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath, BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) { const int tmpFileNameBufSize = strlen(filePath) + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */; // Name of a temporary file used for writing that is a connected string of original name and // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE. char tmpFileName[tmpFileNameBufSize]; snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath, TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); FILE *const file = fopen(tmpFileName, "wb"); if (!file) { AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName); ASSERT(false); return false; } // Write the dictionary header. if (!writeBufferToFile(file, dictHeader)) { remove(tmpFileName); AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition()); ASSERT(false); return false; } // Write the dictionary body. if (!writeBufferToFile(file, dictBody)) { remove(tmpFileName); AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition()); ASSERT(false); return false; } fclose(file); rename(tmpFileName, filePath); return true; }
// Copies the shortcut lists of all terminals listed in |terminalIdMap| from
// |originalShortcutDictContent| into this content.
// For each map entry, the key is used to look up the list in the original content and the value
// is the id under which the copied list position is registered in this content's lookup table.
// Terminals without a shortcut list are skipped. Returns false as soon as a copy or a lookup
// table update fails.
bool ShortcutDictContent::runGC(
        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const ShortcutDictContent *const originalShortcutDictContent) {
    for (const auto &terminalIdPair : *terminalIdMap) {
        const int sourceListPos =
                originalShortcutDictContent->getShortcutListHeadPos(terminalIdPair.first);
        if (sourceListPos == NOT_A_DICT_POS) {
            continue;
        }
        // The copied list is appended at the current tail of this content's buffer.
        const int destinationListPos = getContentBuffer()->getTailPosition();
        const bool isCopied = copyShortcutListFromDictContent(sourceListPos,
                originalShortcutDictContent, destinationListPos);
        if (!isCopied) {
            AKLOGE("Cannot copy shortcut list during GC. original pos: %d, pos: %d",
                    sourceListPos, destinationListPos);
            return false;
        }
        // Register the new list position in the lookup table.
        const bool isRegistered = getUpdatableAddressLookupTable()->set(
                terminalIdPair.second, destinationListPos);
        if (!isRegistered) {
            AKLOGE("Cannot set shortcut list position. terminal id: %d, pos: %d",
                    terminalIdPair.second, destinationListPos);
            return false;
        }
    }
    return true;
}
/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath, const BufferWithExtendableBuffer *const buffer) { const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd == -1) { AKLOGE("File %s cannot be opened. errno: %d", filePath, errno); ASSERT(false); return false; } FILE *const file = fdopen(fd, "wb"); if (!file) { AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno); ASSERT(false); return false; } if (!writeBufferToFile(file, buffer)) { fclose(file); remove(filePath); AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath, buffer->getTailPosition()); ASSERT(false); return false; } fclose(file); return true; }
bool Ver4PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, int *const outPtNodeCount, int *const outFirstPtNodePos) const { if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mBuffer->getTailPosition()) { // Reading invalid position because of a bug or a broken dictionary. AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", ptNodeArrayPos, mBuffer->getTailPosition()); ASSERT(false); return false; } const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodeArrayPos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); int readingPos = ptNodeArrayPos; if (usesAdditionalBuffer) { readingPos -= mBuffer->getOriginalBufferSize(); } const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( dictBuf, &readingPos); if (usesAdditionalBuffer) { readingPos += mBuffer->getOriginalBufferSize(); } if (ptNodeCountInArray < 0) { AKLOGE("Invalid PtNode count in an array: %d.", ptNodeCountInArray); return false; } *outPtNodeCount = ptNodeCountInArray; *outFirstPtNodePos = readingPos; return true; }
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { if (!toBeUpdatedPtNodeParams->isTerminal()) { AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); return false; } const ProbabilityEntry originalProbabilityEntry = mBuffers->getLanguageModelDictContent()->getProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId()); if (originalProbabilityEntry.hasHistoricalInfo()) { const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); const ProbabilityEntry probabilityEntry = originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { AKLOGE("Cannot write updated probability entry. terminalId: %d", toBeUpdatedPtNodeParams->getTerminalId()); return false; } const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); if (!isValid) { if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); return false; } } *outNeedsToKeepPtNode = isValid; } else { // No need to update probability. *outNeedsToKeepPtNode = true; } return true; }
bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilityEntry, const int entryPos) { BufferWithExtendableBuffer *const bufferToWrite = getWritableBuffer(); int writingPos = entryPos; if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getFlags(), Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, &writingPos)) { AKLOGE("Cannot write flags in probability dict content. pos: %d", writingPos); return false; } if (!bufferToWrite->writeUintAndAdvancePosition(probabilityEntry->getProbability(), Ver4DictConstants::PROBABILITY_SIZE, &writingPos)) { AKLOGE("Cannot write probability in probability dict content. pos: %d", writingPos); return false; } if (mHasHistoricalInfo) { const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos); return false; } if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getLevel(), Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &writingPos)) { AKLOGE("Cannot write level in probability dict content. pos: %d", writingPos); return false; } if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getCount(), Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &writingPos)) { AKLOGE("Cannot write count in probability dict content. pos: %d", writingPos); return false; } } return true; }
// Creates a structure policy for a dictionary stored in a single file.
// Only format version 202 is accepted; every other detected version is logged, triggers
// ASSERT(false) and yields nullptr. nullptr is also returned when the file cannot be opened.
/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
        DictionaryStructureWithBufferPolicyFactory::newPolicyForFileDict(
                const char *const path, const int bufOffset, const int size) {
    // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of
    // MmappedBufferPtr if the instance has the responsibility.
    MmappedBuffer::MmappedBufferPtr mmappedBuffer(
            MmappedBuffer::openBuffer(path, bufOffset, size, false /* isUpdatable */));
    if (!mmappedBuffer) {
        return nullptr;
    }
    const auto detectedVersion =
            FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView());
    if (detectedVersion == FormatUtils::VERSION_202) {
        return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
                new PatriciaTriePolicy(std::move(mmappedBuffer)));
    }
    // Everything below is an unsupported format; only the log message differs.
    switch (detectedVersion) {
        case FormatUtils::VERSION_2:
        case FormatUtils::VERSION_201:
            AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
            break;
        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
        case FormatUtils::VERSION_402:
        case FormatUtils::VERSION_403:
            AKLOGE("Given path is a file but the format is version 4. path: %s", path);
            break;
        default:
            AKLOGE("DICT: dictionary format is unknown, bad magic number. path: %s", path);
            break;
    }
    ASSERT(false);
    return nullptr;
}
// Copies the bigram lists of all terminals listed in |terminalIdMap| from
// |originalBigramDictContent| into this content while running GC on each list.
// For each map entry, the key is used to look up the list in the original content and the value
// is the id under which the new list position is registered in this content's lookup table.
// The number of entries that survived GC is added to |*outBigramEntryCount|, which is not reset
// here. Returns false as soon as GC of a list or a lookup table update fails.
bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
        const BigramDictContent *const originalBigramDictContent,
        int *const outBigramEntryCount) {
    for (const auto &terminalIdPair : *terminalIdMap) {
        const int sourceListPos =
                originalBigramDictContent->getBigramListHeadPos(terminalIdPair.first);
        if (sourceListPos == NOT_A_DICT_POS) {
            // This terminal does not have a bigram list.
            continue;
        }
        // The surviving entries are appended at the current tail of this content's buffer.
        const int destinationListPos = getContentBuffer()->getTailPosition();
        int survivingEntryCount = 0;
        const bool isGCDone = runGCBigramList(sourceListPos, originalBigramDictContent,
                destinationListPos, terminalIdMap, &survivingEntryCount);
        if (!isGCDone) {
            AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d",
                    sourceListPos, destinationListPos);
            return false;
        }
        if (survivingEntryCount == 0) {
            // All bigram entries are useless. This terminal does not have a bigram list.
            continue;
        }
        *outBigramEntryCount += survivingEntryCount;
        // Register the new list position in the lookup table.
        const bool isRegistered = getUpdatableAddressLookupTable()->set(
                terminalIdPair.second, destinationListPos);
        if (!isRegistered) {
            AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d",
                    terminalIdPair.second, destinationListPos);
            return false;
        }
    }
    return true;
}
// Opens the body file that belongs to |dictPath| and splits it into content buffers.
//
// The body file is read as a sequence of records, each consisting of a uint32 size field
// followed by that many bytes of content. Every size field is validated before the matching
// sub-view is created, so a truncated or corrupted file is rejected without creating a view
// outside of the mapped region and without overflowing the position.
//
// Returns a null Ver4DictBuffersPtr when |headerBuffer| is null, when the body file cannot be
// opened, when a record is malformed, or when the number of records differs from
// Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE.
/* static */ Ver4DictBuffers::Ver4DictBuffersPtr Ver4DictBuffers::openVer4DictBuffers(
        const char *const dictPath, MmappedBuffer::MmappedBufferPtr &&headerBuffer,
        const FormatUtils::FORMAT_VERSION formatVersion) {
    if (!headerBuffer) {
        // Log before asserting so that the reason is recorded even if ASSERT stops execution.
        AKLOGE("The header buffer must be valid to open ver4 dict buffers.");
        ASSERT(false);
        return Ver4DictBuffersPtr(nullptr);
    }
    // TODO: take only dictDirPath, and open both header and trie files in the constructor below
    const bool isUpdatable = headerBuffer->isUpdatable();
    MmappedBuffer::MmappedBufferPtr bodyBuffer = MmappedBuffer::openBuffer(dictPath,
            Ver4DictConstants::BODY_FILE_EXTENSION, isUpdatable);
    if (!bodyBuffer) {
        return Ver4DictBuffersPtr(nullptr);
    }
    std::vector<ReadWriteByteArrayView> buffers;
    const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView();
    const int bodySize = static_cast<int>(buffer.size());
    // NOTE(review): assumes readUint32AndAdvancePosition() consumes sizeof(uint32_t) bytes.
    const int sizeFieldSize = static_cast<int>(sizeof(uint32_t));
    int position = 0;
    while (position < bodySize) {
        if (bodySize - position < sizeFieldSize) {
            // Not enough bytes left for a size field.
            AKLOGE("The dict body file is corrupted.");
            return Ver4DictBuffersPtr(nullptr);
        }
        const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition(
                buffer.data(), &position);
        // Compare against the remaining size instead of adding to |position| to avoid overflow.
        if (bufferSize < 0 || bufferSize > bodySize - position) {
            AKLOGE("The dict body file is corrupted.");
            return Ver4DictBuffersPtr(nullptr);
        }
        buffers.push_back(buffer.subView(position, bufferSize));
        position += bufferSize;
    }
    if (buffers.size() != Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE) {
        AKLOGE("The dict body file is corrupted.");
        return Ver4DictBuffersPtr(nullptr);
    }
    return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer),
            formatVersion, buffers));
}
// Maps |bufferSize| bytes of the file at |path| starting at |bufferOffset| and wraps the mapping
// in an MmappedBuffer.
//
// mmap() is called with the offset rounded down to a multiple of the page size, so the mapping
// starts |offset| bytes before the requested data and the returned buffer points |offset| bytes
// into it. The mapping is private; it is writable only when |isUpdatable| is true.
//
// Returns nullptr when the file cannot be opened, when the page size cannot be obtained, or when
// mapping fails. The file descriptor and the mapping are released on every failure path.
/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer(
        const char *const path, const int bufferOffset, const int bufferSize,
        const bool isUpdatable) {
    const int mmapFd = open(path, O_RDONLY);
    if (mmapFd < 0) {
        AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno);
        return nullptr;
    }
    const int pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize <= 0) {
        // sysconf() reports failure with -1; the alignment below needs a positive page size.
        AKLOGE("DICT: Can't get the page size. errno=%d", errno);
        close(mmapFd);
        return nullptr;
    }
    const int offset = bufferOffset % pagesize;
    int alignedOffset = bufferOffset - offset;
    int alignedSize = bufferSize + offset;
    const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ;
    void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd,
            alignedOffset);
    if (mmappedBuffer == MAP_FAILED) {
        AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno);
        close(mmapFd);
        return nullptr;
    }
    uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset;
    if (!buffer) {
        AKLOGE("DICT: buffer is null");
        // The mapping was created successfully, so it has to be released as well.
        munmap(mmappedBuffer, alignedSize);
        close(mmapFd);
        return nullptr;
    }
    return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize,
            mmapFd, isUpdatable));
}
bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, const bool isValidWord, const HistoricalInfo historicalInfo) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " "dictionary."); return false; } const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? false : isValidWord; int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); if (wordId == NOT_A_WORD_ID) { // The word is not in the dictionary. const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); return false; } if (!isValidWord) { return true; } wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); } WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); return false; } // Refresh word ids. 
ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } // Update entries for beginning of sentence. if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } } if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { return false; } return true; }
// Unmaps the mapped region and closes the file descriptor.
// Nothing is released when the aligned size is 0. Failures are only logged.
MmappedBuffer::~MmappedBuffer() {
    if (mAlignedSize != 0) {
        const int unmapResult = munmap(mMmappedBuffer, mAlignedSize);
        if (unmapResult != 0) {
            AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", unmapResult, errno);
        }
        const int closeResult = close(mMmapFd);
        if (closeResult != 0) {
            AKLOGE("DICT: Failure in close. ret=%d errno=%d", closeResult, errno);
        }
    }
}
// Releases a dictionary buffer.
// With USE_MMAP_FOR_DICTIONARY defined, the region of |length| bytes at |dictBuf| is unmapped
// and |fd| is closed; failures are only logged. Otherwise |dictBuf| is passed to free() and
// |length| and |fd| are unused.
static void releaseDictBuf(const void *dictBuf, const size_t length, const int fd) {
#ifdef USE_MMAP_FOR_DICTIONARY
    const int unmapResult = munmap(const_cast<void *>(dictBuf), length);
    if (unmapResult != 0) {
        AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", unmapResult, errno);
    }
    const int closeResult = close(fd);
    if (closeResult != 0) {
        AKLOGE("DICT: Failure in close. ret=%d errno=%d", closeResult, errno);
    }
#else // USE_MMAP_FOR_DICTIONARY
    free(const_cast<void *>(dictBuf));
#endif // USE_MMAP_FOR_DICTIONARY
}
// Follow the forward link and read the next node array if exists. void DynamicPatriciaTrieReadingHelper::followForwardLink() { if (mReadingState.mPos < 0 || mReadingState.mPos >= mBuffer->getTailPosition()) { // Reading invalid position because of bug or broken dictionary. AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", mReadingState.mPos, mBuffer->getTailPosition()); ASSERT(false); mIsError = true; mReadingState.mPos = NOT_A_DICT_POS; return; } const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { mReadingState.mPos -= mBuffer->getOriginalBufferSize(); } const int forwardLinkPosition = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mReadingState.mPos); if (usesAdditionalBuffer) { mReadingState.mPos += mBuffer->getOriginalBufferSize(); } mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { // Follow the forward link. mReadingState.mPos += forwardLinkPosition; nextPtNodeArray(); } else { // All node arrays have been read. mReadingState.mPos = NOT_A_DICT_POS; } }
bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1) { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); return false; } if (mBufferWithExtendableBuffer.getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update."); return false; } const int word0Pos = getTerminalNodePositionOfWord(word0, length0, false /* forceLowerCaseSearch */); if (word0Pos == NOT_A_DICT_POS) { return false; } const int word1Pos = getTerminalNodePositionOfWord(word1, length1, false /* forceLowerCaseSearch */); if (word1Pos == NOT_A_DICT_POS) { return false; } DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); if (writingHelper.removeBigramWords(word0Pos, word1Pos)) { mBigramCount--; return true; } else { return false; } }
// Creates a structure policy for a new on-memory dictionary of the requested format version.
// Version 402 is served by the backward::v402 classes; version 403 and the testing-only
// version 4 are served by the current version 4 classes. Any other version is logged and yields
// nullptr.
/* static */ DictionaryStructureWithBufferPolicy::StructurePolicyPtr
        DictionaryStructureWithBufferPolicyFactory::newPolicyForOnMemoryDict(
                const int formatVersion, const std::vector<int> &locale,
                const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
    const FormatUtils::FORMAT_VERSION requestedVersion =
            FormatUtils::getFormatVersion(formatVersion);
    switch (requestedVersion) {
        case FormatUtils::VERSION_402:
            return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
                    backward::v402::Ver4DictBuffers,
                    backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
                    backward::v402::Ver4PatriciaTriePolicy>(
                            requestedVersion, locale, attributeMap);
        case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
        case FormatUtils::VERSION_403:
            return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
                    Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
                            requestedVersion, locale, attributeMap);
        default:
            break;
    }
    // Only unsupported versions reach this point.
    AKLOGE("DICT: dictionary format %d is not supported for on memory dictionary",
            formatVersion);
    return nullptr;
}
void ShortcutDictContent::getShortcutEntryAndAdvancePosition(const int maxCodePointCount, int *const outCodePoint, int *const outCodePointCount, int *const outProbability, bool *const outhasNext, int *const shortcutEntryPos) const { const BufferWithExtendableBuffer *const shortcutListBuffer = getContentBuffer(); if (*shortcutEntryPos < 0 || *shortcutEntryPos >= shortcutListBuffer->getTailPosition()) { AKLOGE("Invalid shortcut entry position. shortcutEntryPos: %d, bufSize: %d", *shortcutEntryPos, shortcutListBuffer->getTailPosition()); ASSERT(false); if (outhasNext) { *outhasNext = false; } if (outCodePointCount) { *outCodePointCount = 0; } return; } const int shortcutFlags = shortcutListBuffer->readUintAndAdvancePosition( Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos); if (outProbability) { *outProbability = shortcutFlags & Ver4DictConstants::SHORTCUT_PROBABILITY_MASK; } if (outhasNext) { *outhasNext = shortcutFlags & Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK; } if (outCodePoint && outCodePointCount) { shortcutListBuffer->readCodePointsAndAdvancePosition( maxCodePointCount, outCodePoint, outCodePointCount, shortcutEntryPos); } }
bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, const int probability) { if (!mBuffer->isUpdatable()) { AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); return false; } if (mBufferWithExtendableBuffer.getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { AKLOGE("The dictionary is too large to dynamically update."); return false; } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); readingHelper.initWithPtNodeArrayPos(getRootPosition()); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy, mHeaderPolicy.isDecayingDict()); bool addedNewUnigram = false; if (writingHelper.addUnigramWord(&readingHelper, word, length, probability, &addedNewUnigram)) { if (addedNewUnigram) { mUnigramCount++; } return true; } else { return false; } }
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( const PtNodeParams *const toBeUpdatedPtNodeParams) { int pos = toBeUpdatedPtNodeParams->getHeadPos(); const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { pos -= mTrieBuffer->getOriginalBufferSize(); } // Read original flags const PatriciaTrieReadingUtils::NodeFlags originalFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); const PatriciaTrieReadingUtils::NodeFlags updatedFlags = DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, false /* isDeleted */, true /* willBecomeNonTerminal */); if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { AKLOGE("Cannot update terminal position lookup table. terminal id: %d", toBeUpdatedPtNodeParams->getTerminalId()); return false; } // Update flags. int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, &writingPos); }
bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); return false; } const ProbabilityEntry probabilityEntry = ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); if (prevWordCount > 0 && probabilityEntry.isValid() && !mTrieMap.getRoot(entry.key()).mIsValid) { // The entry is related to a word that has been removed. Remove the entry. if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { return false; } continue; } if (mHasHistoricalInfo && probabilityEntry.isValid()) { const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( *originalHistoricalInfo)) { // Remove the entry. if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { return false; } continue; } if (needsToHalveCounters) { const int updatedCount = originalHistoricalInfo->getCount() / 2; if (updatedCount == 0) { // Remove the entry. 
if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { return false; } continue; } const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), originalHistoricalInfo->getLevel(), updatedCount); const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), &historicalInfoToSave); if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), bitmapEntryIndex)) { return false; } } } outEntryCounters->incrementNgramCount( NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); if (!entry.hasNextLevelMap()) { continue; } if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { return false; } } return true; }
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, int *const outCodePointCount) { *outCodePointCount = 0; if (token == 0) { mTerminalPtNodePositionsForIteratingWords.clear(); DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( &mTerminalPtNodePositionsForIteratingWords); DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); } const int terminalPtNodePositionsVectorSize = static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); if (token < 0 || token >= terminalPtNodePositionsVectorSize) { AKLOGE("Given token %d is invalid.", token); return 0; } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos); *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(), MAX_WORD_LENGTH, outCodePoints); const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. mTerminalPtNodePositionsForIteratingWords.clear(); return 0; } return nextToken; }
// JNI entry point that opens the dictionary at |sourceDir| and returns the address of a newly
// allocated Dictionary as a jlong.
// Returns 0 when the path string is empty or cannot be measured, or when no structure policy
// can be created for the given path, offset and size.
static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring sourceDir,
        jlong dictOffset, jlong dictSize, jboolean isUpdatable) {
    PROF_OPEN;
    PROF_START(66);
    const jsize pathUtf8Length = env->GetStringUTFLength(sourceDir);
    if (pathUtf8Length <= 0) {
        AKLOGE("DICT: Can't get sourceDir string");
        return 0;
    }
    // One extra byte for the terminator that is appended below.
    char pathChars[pathUtf8Length + 1];
    env->GetStringUTFRegion(sourceDir, 0, env->GetStringLength(sourceDir), pathChars);
    pathChars[pathUtf8Length] = '\0';
    const bool updatable = (isUpdatable == JNI_TRUE);
    DictionaryStructureWithBufferPolicy *const structurePolicy =
            DictionaryStructureWithBufferPolicyFactory::newDictionaryStructureWithBufferPolicy(
                    pathChars, static_cast<int>(dictOffset), static_cast<int>(dictSize),
                    updatable);
    if (structurePolicy == nullptr) {
        return 0;
    }
    Dictionary *const newDictionary = new Dictionary(env, structurePolicy);
    PROF_END(66);
    PROF_CLOSE;
    return reinterpret_cast<jlong>(newDictionary);
}
// Method to iterate all words in the dictionary for makedict. // If token is 0, this method newly starts iterating the dictionary. This method returns 0 when // the dictionary does not have a next word. static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, jlong dict, jint token, jintArray outCodePoints, jbooleanArray outIsBeginningOfSentence) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return 0; const jsize codePointBufSize = env->GetArrayLength(outCodePoints); if (codePointBufSize != MAX_WORD_LENGTH) { AKLOGE("Invalid outCodePointsLength: %d", codePointBufSize); ASSERT(false); return 0; } int wordCodePoints[codePointBufSize]; int wordCodePointCount = 0; const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints, &wordCodePointCount); JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, MAX_WORD_LENGTH /* maxLength */, wordCodePoints, wordCodePointCount, false /* needsNullTermination */); bool isBeginningOfSentence = false; if (wordCodePointCount > 0 && wordCodePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { isBeginningOfSentence = true; } JniDataUtils::putBooleanToArray(env, outIsBeginningOfSentence, 0 /* index */, isBeginningOfSentence); return nextToken; }
bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { if (!ptNodeParams->isTerminal()) { return true; } TerminalPositionLookupTable::TerminalIdMap::const_iterator it = mTerminalIdMap->find(ptNodeParams->getTerminalId()); if (it == mTerminalIdMap->end()) { AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", ptNodeParams->getTerminalId(), mTerminalIdMap->size()); return false; } if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); } return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams); }
bool ShortcutDictContent::writeShortcutEntryAndAdvancePosition(const int *const codePoint, const int codePointCount, const int probability, const bool hasNext, int *const shortcutEntryPos) { BufferWithExtendableBuffer *const shortcutListBuffer = getWritableContentBuffer(); const int shortcutFlags = createAndGetShortcutFlags(probability, hasNext); if (!shortcutListBuffer->writeUintAndAdvancePosition(shortcutFlags, Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE, shortcutEntryPos)) { AKLOGE("Cannot write shortcut flags. flags; %x, pos: %d", shortcutFlags, *shortcutEntryPos); return false; } if (!shortcutListBuffer->writeCodePointsAndAdvancePosition(codePoint, codePointCount, true /* writesTerminator */, shortcutEntryPos)) { AKLOGE("Cannot write shortcut target code points. pos: %d", *shortcutEntryPos); return false; } return true; }
// Adds a bigram entry from the first id in |prevWordIds| to |wordId| through the bigram policy.
// |outAddedNewBigram| is passed through to the policy. Returns false when the policy fails.
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds,
        const int wordId, const BigramProperty *const bigramProperty,
        bool *const outAddedNewBigram) {
    const bool isAdded = mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty,
            outAddedNewBigram);
    if (isAdded) {
        return true;
    }
    AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
            prevWordIds[0], wordId);
    return false;
}
// Adds a shortcut target to the terminal described by |ptNodeParams| through the shortcut
// policy. Returns false when the policy fails.
bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
        const int *const targetCodePoints, const int targetCodePointCount,
        const int shortcutProbability) {
    const bool isAdded = mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
            targetCodePoints, targetCodePointCount, shortcutProbability);
    if (isAdded) {
        return true;
    }
    AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
    return false;
}
// Adds a unigram entry for |word| with |unigramProperty| to the underlying dictionary.
// A Beginning-of-Sentence entry is rejected when the header policy does not support it.
// The current time is refreshed through TimeKeeper before the entry is added.
bool Dictionary::addUnigramEntry(const int *const word, const int length,
        const UnigramProperty *const unigramProperty) {
    if (unigramProperty->representsBeginningOfSentence()) {
        const bool isBoSSupported = mDictionaryStructureWithBufferPolicy
                ->getHeaderStructurePolicy()->supportsBeginningOfSentence();
        if (!isBoSSupported) {
            AKLOGE("The dictionary doesn't support Beginning-of-Sentence.");
            return false;
        }
    }
    TimeKeeper::setCurrentTime();
    return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
}
// Decides for GC whether a terminal PtNode has to be kept, and reports the decision through
// |outNeedsToKeepPtNode|.
//
// The PtNode is kept when its probability entry in the language model content is valid.
// Otherwise it is marked as "will become non-terminal" and reported as not needed.
//
// Returns false when called for a non-terminal PtNode or when the PtNode cannot be marked; in
// those cases |outNeedsToKeepPtNode| is left untouched.
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
        const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
    if (!toBeUpdatedPtNodeParams->isTerminal()) {
        // The message names this function so that it can be found from the log.
        AKLOGE("updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC is called for non-terminal "
                "PtNode.");
        return false;
    }
    const ProbabilityEntry originalProbabilityEntry =
            mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
                    toBeUpdatedPtNodeParams->getTerminalId());
    if (originalProbabilityEntry.isValid()) {
        *outNeedsToKeepPtNode = true;
        return true;
    }
    if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
        AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
        return false;
    }
    *outNeedsToKeepPtNode = false;
    return true;
}