/// Error callback: hands the received exception over to onTerminal().
///
/// @param ex The error being reported; it is moved into the onTerminal() call.
void ConnectionAutomaton::onError(folly::exception_wrapper ex) {
  // All handling is delegated to the shared onTerminal() path.
  onTerminal(std::move(ex));
}
// ProcessCurrentNode returns a boolean telling whether to traverse children nodes or not. // If the return value is false, then the caller should read in the output "nextSiblingPosition" // to find out the address of the next sibling node and pass it to a new call of processCurrentNode. // It is worthy to note that when false is returned, the output values other than // nextSiblingPosition are undefined. // If the return value is true, then the caller must proceed to traverse the children of this // node. processCurrentNode will output the information about the children: their count in // newCount, their position in newChildrenPosition, the traverseAllNodes flag in // newTraverseAllNodes, the match weight into newMatchRate, the input index into newInputIndex, the // diffs into newDiffs, the sibling position in nextSiblingPosition, and the output index into // newOutputIndex. Please also note the following caveat: processCurrentNode does not know when // there aren't any more nodes at this level, it merely returns the address of the first byte after // the current node in nextSiblingPosition. Thus, the caller must keep count of the nodes at any // given level, as output into newCount when traversing this level's parent. inline bool UnigramDictionary::processCurrentNode(const int initialPos, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, Correction *correction, int *newCount, int *newChildrenPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool, const int currentWordIndex) { if (DEBUG_DICT) { correction->checkState(); } int pos = initialPos; // Flags contain the following information: // - Address type (MASK_GROUP_ADDRESS_TYPE) on two bits: // - FLAG_GROUP_ADDRESS_TYPE_{ONE,TWO,THREE}_BYTES means there are children and their address // is on the specified number of bytes. // - FLAG_GROUP_ADDRESS_TYPE_NOADDRESS means there are no children, and therefore no address. 
// - FLAG_HAS_MULTIPLE_CHARS: whether this node has multiple char or not. // - FLAG_IS_TERMINAL: whether this node is a terminal or not (it may still have children) // - FLAG_HAS_BIGRAMS: whether this node has bigrams or not const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(DICT_ROOT, &pos); const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); const bool isTerminalNode = (0 != (FLAG_IS_TERMINAL & flags)); bool needsToInvokeOnTerminal = false; // This gets only ONE character from the stream. Next there will be: // if FLAG_HAS_MULTIPLE CHARS: the other characters of the same node // else if FLAG_IS_TERMINAL: the frequency // else if MASK_GROUP_ADDRESS_TYPE is not NONE: the children address // Note that you can't have a node that both is not a terminal and has no children. int32_t c = BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); assert(NOT_A_CHARACTER != c); // We are going to loop through each character and make it look like it's a different // node each time. To do that, we will process characters in this node in order until // we find the character terminator. This is signalled by getCharCode* returning // NOT_A_CHARACTER. // As a special case, if there is only one character in this node, we must not read the // next bytes so we will simulate the NOT_A_CHARACTER return by testing the flags. // This way, each loop run will look like a "virtual node". do { // We prefetch the next char. If 'c' is the last char of this node, we will have // NOT_A_CHARACTER in the next char. From this we can decide whether this virtual node // should behave as a terminal or not and whether we have children. const int32_t nextc = hasMultipleChars ? BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos) : NOT_A_CHARACTER; const bool isLastChar = (NOT_A_CHARACTER == nextc); // If there are more chars in this nodes, then this virtual node is not a terminal. // If we are on the last char, this virtual node is a terminal if this node is. 
const bool isTerminal = isLastChar && isTerminalNode; Correction::CorrectionType stateType = correction->processCharAndCalcState( c, isTerminal); if (stateType == Correction::TRAVERSE_ALL_ON_TERMINAL || stateType == Correction::ON_TERMINAL) { needsToInvokeOnTerminal = true; } else if (stateType == Correction::UNRELATED || correction->needsToPrune()) { // We found that this is an unrelated character, so we should give up traversing // this node and its children entirely. // However we may not be on the last virtual node yet so we skip the remaining // characters in this node, the frequency if it's there, read the next sibling // position to output it, then return false. // We don't have to output other values because we return false, as in // "don't traverse children". if (!isLastChar) { pos = BinaryFormat::skipOtherCharacters(DICT_ROOT, pos); } pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; } // Prepare for the next character. Promote the prefetched char to current char - the loop // will take care of prefetching the next. If we finally found our last char, nextc will // contain NOT_A_CHARACTER. c = nextc; } while (NOT_A_CHARACTER != c); if (isTerminalNode) { // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); const int childrenAddressPos = BinaryFormat::skipFrequency(flags, pos); const int attributesPos = BinaryFormat::skipChildrenPosition(flags, childrenAddressPos); TerminalAttributes terminalAttributes(DICT_ROOT, flags, attributesPos); // bigramMap contains the bigram frequencies indexed by addresses for fast lookup. // bigramFilter is a bloom filter of said frequencies for even faster rejection. 
const int probability = BinaryFormat::getProbability(initialPos, bigramMap, bigramFilter, unigramFreq); onTerminal(probability, terminalAttributes, correction, queuePool, needsToInvokeOnTerminal, currentWordIndex); // If there are more chars in this node, then this virtual node has children. // If we are on the last char, this virtual node has children if this node has. const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); // This character matched the typed character (enough to traverse the node at least) // so we just evaluated it. Now we should evaluate this virtual node's children - that // is, if it has any. If it has no children, we're done here - so we skip the end of // the node, output the siblings position, and return false "don't traverse children". // Note that !hasChildren implies isLastChar, so we know we don't have to skip any // remaining char in this group for there can't be any. if (!hasChildren) { pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); return false; } // Optimization: Prune out words that are too long compared to how much was typed. if (correction->needsToPrune()) { pos = BinaryFormat::skipFrequency(flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); if (DEBUG_DICT_FULL) { AKLOGI("Traversing was pruned."); } return false; } } // Now we finished processing this node, and we want to traverse children. If there are no // children, we can't come here. assert(BinaryFormat::hasChildrenInFlags(flags)); // If this node was a terminal it still has the frequency under the pointer (it may have been // read, but not skipped - see readFrequencyWithoutMovingPointer). // Next come the children position, then possibly attributes (attributes are bigrams only for // now, maybe something related to shortcuts in the future). 
// Once this is read, we still need to output the number of nodes in the immediate children of // this node, so we read and output it before returning true, as in "please traverse children". pos = BinaryFormat::skipFrequency(flags, pos); int childrenPos = BinaryFormat::readChildrenPosition(DICT_ROOT, flags, pos); *nextSiblingPosition = BinaryFormat::skipChildrenPosAndAttributes(DICT_ROOT, flags, pos); *newCount = BinaryFormat::getGroupCountAndForwardPointer(DICT_ROOT, &childrenPos); *newChildrenPosition = childrenPos; return true; }
/// Completion callback: invokes onTerminal() with a default-constructed
/// folly::exception_wrapper.
void ConnectionAutomaton::onComplete() {
  // NOTE(review): the default-constructed wrapper presumably means "no error";
  // confirm against onTerminal().
  onTerminal(folly::exception_wrapper());
}