bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice, XHeightConsistencyEnum xheight_consistency) { float CertaintyThreshold = stopper_nondict_certainty_base; int WordSize; if (stopper_no_acceptable_choices) return false; if (best_choice.length() == 0) return false; bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); bool is_case_ok = case_ok(best_choice, getUnicharset()); if (stopper_debug_level >= 1) { const char *xht = "UNKNOWN"; switch (xheight_consistency) { case XH_GOOD: xht = "NORMAL"; break; case XH_SUBNORMAL: xht = "SUBNORMAL"; break; case XH_INCONSISTENT: xht = "INCONSISTENT"; break; default: xht = "UNKNOWN"; } tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", best_choice.unichar_string().string(), (is_valid_word ? 'y' : 'n'), (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); } // Do not accept invalid words in PASS1. if (reject_offset_ <= 0.0f && !is_valid_word) return false; if (is_valid_word && is_case_ok) { WordSize = LengthOfShortestAlphaRun(best_choice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", best_choice.rating(), best_choice.certainty(), CertaintyThreshold); if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { return true; } else { if (stopper_debug_level >= 1) { tprintf("AcceptableChoice() returned false" " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, UniformCertainties(best_choice)); } return false; } }
bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) { float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; int WordSize; if (stopper_debug_level >= 1) { tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", BestChoice.debug_string(getUnicharset()).string(), (valid_word(BestChoice) ? 'y' : 'n'), (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y')); } if (BestChoice.length() == 0 || CurrentWordAmbig()) return false; if (BestChoice.fragment_mark()) { if (stopper_debug_level >= 1) { cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n"); } return false; } if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) { WordSize = LengthOfShortestAlphaRun(BestChoice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", BestChoice.certainty(), CertaintyThreshold); if (BestChoice.certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { if (stopper_debug_level >= 1) cprintf("ACCEPTED\n"); return true; } else { if (stopper_debug_level >= 1) cprintf("REJECTED\n"); return false; } }
/**
 * Rejecter test on the best choice held by a WERD_RES.
 *
 * The threshold starts at stopper_nondict_certainty_base - reject_offset_
 * and, for valid words with acceptable case, is raised by
 * stopper_certainty_per_char for every character by which the shortest
 * alpha run exceeds stopper_smallword_size.
 *
 * @param word  word result whose best_choice is tested
 * @return false if there is no best choice, it is empty, or best_choices is
 *         not a singleton; otherwise true only if the certainty beats the
 *         threshold and stopper_no_acceptable_choices is not set.
 */
bool Dict::AcceptableResult(WERD_RES *word) const {
  if (word->best_choice == nullptr) return false;

  const bool verbose = stopper_debug_level >= 1;
  float threshold = stopper_nondict_certainty_base - reject_offset_;

  if (verbose) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            word->best_choice->debug_string().string(),
            (valid_word(*word->best_choice) ? 'y' : 'n'),
            (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
            word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  if (word->best_choice->length() == 0 || !word->best_choices.singleton()) {
    return false;
  }

  if (valid_word(*word->best_choice) &&
      case_ok(*word->best_choice, getUnicharset())) {
    // Longer dictionary words are allowed a laxer threshold.
    int extra_chars = LengthOfShortestAlphaRun(*word->best_choice);
    extra_chars -= stopper_smallword_size;
    if (extra_chars < 0) extra_chars = 0;
    threshold += extra_chars * stopper_certainty_per_char;
  }

  if (verbose) {
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
            word->best_choice->certainty(), threshold);
  }

  const bool accepted = word->best_choice->certainty() > threshold &&
                        !stopper_no_acceptable_choices;
  if (!accepted) {
    if (verbose) tprintf("REJECTED\n");
    return false;
  }
  if (verbose) tprintf("ACCEPTED\n");
  return true;
}
/**
 * dawg_permute_and_select
 *
 * Walks every combination of characters offered by char_choices via
 * permute_choices(), with go_deeper_dawg_fxn() installed as the
 * per-character callback so that words rejected by the dawgs are discarded.
 *
 * @param char_choices  candidate classifications, one list per position
 * @param rating_limit  initial rating given to the returned choice; its
 *                      address is also handed to permute_choices()
 * @return a newly allocated WERD_CHOICE (caller owns it). If char_choices
 *         is empty or longer than MAX_WERD_LENGTH the "bad" choice carrying
 *         rating_limit is returned without searching.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *result = new WERD_CHOICE(&getUnicharset());
  result->make_bad();
  result->set_rating(rating_limit);

  const int num_positions = char_choices.length();
  const bool searchable =
      num_positions != 0 && num_positions <= MAX_WERD_LENGTH;
  if (!searchable) return result;

  // One dawg position vector per character position plus the initial state.
  DawgPositionVector *dawg_states = new DawgPositionVector[num_positions + 1];
  init_active_dawgs(dawg_states, true);
  DawgArgs search_args(dawg_states, dawg_states + 1, NO_PERM);

  WERD_CHOICE scratch_word(&getUnicharset(), MAX_WERD_LENGTH);
  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  const char *debug_tag = dawg_debug_level ? "permute_dawg_debug" : nullptr;

  permute_choices(debug_tag, char_choices, 0, nullptr, &scratch_word,
                  certainties, &rating_limit, result, &attempts_left,
                  &search_args);

  delete[] dawg_states;
  return result;
}
/**
 * @name adjust_word
 *
 * Rescales the rating of a word choice by a penalty factor picked from its
 * case and frequency status, then logs the result through LogNewChoice().
 *
 * The factor is segment_penalty_dict_case_bad when the case check fails,
 * segment_penalty_dict_frequent_word when the word is found in freq_dawg_
 * (the permuter is then set to FREQ_DAWG_PERM), and
 * segment_penalty_dict_case_ok otherwise. RATING_PAD is added before and
 * subtracted after the multiplication.
 *
 * @param word             word choice whose rating is updated in place
 * @param certainty_array  forwarded unchanged to LogNewChoice()
 */
void Dict::adjust_word(WERD_CHOICE *word, float *certainty_array) {
  float padded_rating = word->rating();
  if (segment_dawg_debug) {
    tprintf("Word: %s %4.2f ",
            word->debug_string(getUnicharset()).string(), word->rating());
  }
  padded_rating += RATING_PAD;

  double penalty;
  if (!Context::case_ok(*word, getUnicharset())) {
    penalty = segment_penalty_dict_case_bad;
    if (segment_dawg_debug) {
      tprintf(", C %4.2f ", penalty);
    }
  } else if (freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
    word->set_permuter(FREQ_DAWG_PERM);
    penalty = segment_penalty_dict_frequent_word;
    if (segment_dawg_debug) {
      tprintf(", F, %4.2f ", penalty);
    }
  } else {
    penalty = segment_penalty_dict_case_ok;
    if (segment_dawg_debug) {
      tprintf(", %4.2f ", penalty);
    }
  }

  padded_rating *= penalty;
  const float factor = penalty;
  padded_rating -= RATING_PAD;

  word->set_rating(padded_rating);
  LogNewChoice(*word, factor, certainty_array, false);
  if (segment_dawg_debug) {
    tprintf(" --> %4.2f\n", padded_rating);
  }
}
/**
 * Returns the length of the shortest maximal run of consecutive alphabetic
 * unichars in WordChoice, or 0 if the word has no alphabetic unichar.
 */
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
  const auto &charset = getUnicharset();
  int min_run = INT32_MAX;  // sentinel: no completed run seen yet
  int run = 0;              // length of the run currently being scanned

  for (int pos = 0; pos < WordChoice.length(); ++pos) {
    if (charset.get_isalpha(WordChoice.unichar_id(pos))) {
      ++run;
      continue;
    }
    // A non-alpha unichar closes the current run, if there is one.
    if (run > 0 && run < min_run) min_run = run;
    run = 0;
  }

  // The word may end in the middle of a run.
  if (run > 0 && run < min_run) return run;
  return (min_run == INT32_MAX) ? 0 : min_run;
}
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { int num_blobs_to_replace = 0; int begin_blob_index = 0; int i; // Rating and certainty for the new BLOB_CHOICE are derived from the // replaced choices. float new_rating = 0.0f; float new_certainty = 0.0f; BLOB_CHOICE* old_choice = nullptr; for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { if (i >= wrong_ngram_begin_index) { int num_blobs = werd_choice->state(i); int col = begin_blob_index + num_blobs_to_replace; int row = col + num_blobs - 1; BLOB_CHOICE_LIST* choices = ratings->get(col, row); ASSERT_HOST(choices != nullptr); old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); ASSERT_HOST(old_choice != nullptr); new_rating += old_choice->rating(); new_certainty += old_choice->certainty(); num_blobs_to_replace += num_blobs; } else { begin_blob_index += werd_choice->state(i); } } new_certainty /= wrong_ngram_size; // If there is no entry in the ratings matrix, add it. MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); if (!coord.Valid(*ratings)) { ratings->IncreaseBandSize(coord.row - coord.col + 1); } if (ratings->get(coord.col, coord.row) == nullptr) ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row); BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices); if (choice != nullptr) { // Already there. Upgrade if new rating better. if (new_rating < choice->rating()) choice->set_rating(new_rating); if (new_certainty < choice->certainty()) choice->set_certainty(new_certainty); // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. } else { // Need a new choice with the correct_ngram_id. 
choice = new BLOB_CHOICE(*old_choice); choice->set_unichar_id(correct_ngram_id); choice->set_rating(new_rating); choice->set_certainty(new_certainty); choice->set_classifier(BCC_AMBIG); choice->set_matrix_cell(coord.col, coord.row); BLOB_CHOICE_IT it (new_choices); it.add_to_end(choice); } // Remove current unichar from werd_choice. On the last iteration // set the correct replacement unichar instead of removing a unichar. for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { if (replaced_count + 1 == wrong_ngram_size) { werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); } else { werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); } } if (stopper_debug_level >= 1) { werd_choice->print("ReplaceAmbig() "); tprintf("Modified blob_choices: "); print_ratings_list("\n", new_choices, getUnicharset()); } }
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings) { if (stopper_debug_level > 2) { tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().string()); } // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities // for each unichar id in BestChoice. BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; int i; bool ambigs_found = false; // For each position in best_choice: // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] // -- initialize wrong_ngram with a single unichar_id at best_choice[i] // -- look for ambiguities corresponding to wrong_ngram in the list while // adding the following unichar_ids from best_choice to wrong_ngram // // Repeat the above procedure twice: first time look through // ambigs to be replaced and replace all the ambiguities found; // second time look through dangerous ambiguities and construct // ambig_blob_choices with fake a blob choice for each ambiguity // and pass them to dawg_permute_and_select() to search for // ambiguous words in the dictionaries. // // Note that during the execution of the for loop (on the first pass) // if replacements are made the length of best_choice might change. for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { bool replace = (fix_replaceable && pass == 0); const UnicharAmbigsVector &table = replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); if (!replace) { // Initialize ambig_blob_choices with lists containing a single // unichar id for the correspoding position in best_choice. // best_choice consisting from only the original letters will // have a rating of 0.0. for (i = 0; i < best_choice->length(); ++i) { BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST(); BLOB_CHOICE_IT lst_it(lst); // TODO(rays/antonova) Put real xheights and y shifts here. 
lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); ambig_blob_choices.push_back(lst); } } UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; int wrong_ngram_index; int next_index; int blob_index = 0; for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) { UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i); if (stopper_debug_level > 2) { tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous", getUnicharset().debug_str(curr_unichar_id).string()); } int num_wrong_blobs = best_choice->state(i); wrong_ngram_index = 0; wrong_ngram[wrong_ngram_index] = curr_unichar_id; if (curr_unichar_id == INVALID_UNICHAR_ID || curr_unichar_id >= table.size() || table[curr_unichar_id] == nullptr) { continue; // there is no ambig spec for this unichar id } AmbigSpec_IT spec_it(table[curr_unichar_id]); for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { const AmbigSpec *ambig_spec = spec_it.data(); wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID; int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram); if (stopper_debug_level > 2) { tprintf("candidate ngram: "); UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); tprintf("current ngram from spec: "); UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); tprintf("comparison result: %d\n", compare); } if (compare == 0) { // Record the place where we found an ambiguity. 
if (fixpt != nullptr) { UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; fixpt->push_back(DANGERR_INFO( blob_index, blob_index + num_wrong_blobs, replace, getUnicharset().get_isngram(ambig_spec->correct_ngram_id), leftmost_id)); if (stopper_debug_level > 1) { tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false, getUnicharset().get_isngram( ambig_spec->correct_ngram_id), getUnicharset().id_to_unichar(leftmost_id)); } } if (replace) { if (stopper_debug_level > 2) { tprintf("replace ambiguity with %s : ", getUnicharset().id_to_unichar( ambig_spec->correct_ngram_id)); UnicharIdArrayUtils::print( ambig_spec->correct_fragments, getUnicharset()); } ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice, ratings); } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { // We found dang ambig - update ambig_blob_choices. if (stopper_debug_level > 2) { tprintf("found ambiguity: "); UnicharIdArrayUtils::print( ambig_spec->correct_fragments, getUnicharset()); } ambigs_found = true; for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) { // Add a blob choice for the corresponding fragment of the // ambiguity. These fake blob choices are initialized with // negative ratings (which are not possible for real blob // choices), so that dawg_permute_and_select() considers any // word not consisting of only the original letters a better // choice and stops searching for alternatives once such a // choice is found. BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); bc_it.add_to_end(new BLOB_CHOICE( ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); } } spec_it.forward(); } else if (compare == -1) { if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size && ((next_index = wrong_ngram_index+1+i) < best_choice->length())) { // Add the next unichar id to wrong_ngram and keep looking for // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. 
wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index); num_wrong_blobs += best_choice->state(next_index); } else { break; // no more matching ambigs in this AMBIG_SPEC_LIST } } else { spec_it.forward(); } } // end searching AmbigSpec_LIST } // end searching best_choice } // end searching replace and dangerous ambigs // If any ambiguities were found permute the constructed ambig_blob_choices // to see if an alternative dictionary word can be found. if (ambigs_found) { if (stopper_debug_level > 2) { tprintf("\nResulting ambig_blob_choices:\n"); for (i = 0; i < ambig_blob_choices.length(); ++i) { print_ratings_list("", ambig_blob_choices.get(i), getUnicharset()); tprintf("\n"); } } WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); ambigs_found = (alt_word->rating() < 0.0); if (ambigs_found) { if (stopper_debug_level >= 1) { tprintf ("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().string()); } if (fixpt != nullptr) { // Note: Currently character choices combined from fragments can only // be generated by NoDangrousAmbigs(). This code should be updated if // the capability to produce classifications combined from character // fragments is added to other functions. int orig_i = 0; for (i = 0; i < alt_word->length(); ++i) { const UNICHARSET &uchset = getUnicharset(); bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i)); UNICHAR_ID leftmost_id = alt_word->unichar_id(i); if (replacement_is_ngram) { // we have to extract the leftmost unichar from the ngram. const char *str = uchset.id_to_unichar(leftmost_id); int step = uchset.step(str); if (step) leftmost_id = uchset.unichar_to_id(str, step); } int end_i = orig_i + alt_word->state(i); if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) { // Compute proper blob indices. 
int blob_start = 0; for (int j = 0; j < orig_i; ++j) blob_start += best_choice->state(j); int blob_end = blob_start; for (int j = orig_i; j < end_i; ++j) blob_end += best_choice->state(j); fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id)); if (stopper_debug_level > 1) { tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true, replacement_is_ngram, uchset.id_to_unichar(leftmost_id)); } } orig_i += alt_word->state(i); } } } delete alt_word; } if (output_ambig_words_file_ != nullptr) { fprintf(output_ambig_words_file_, "\n"); } ambig_blob_choices.delete_data_pointers(); return !ambigs_found; }
// Returns true if in light of the current state the letter at word_index // in the given word is allowed according to at least one of the dawgs in // dawgs_. // // See more extensive comments in dict.h where this function is declared. // int Dict::def_letter_is_okay(void* void_dawg_args, int word_index, const void *void_word, bool word_end) { DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args); const WERD_CHOICE *word = reinterpret_cast<const WERD_CHOICE*>(void_word); if (dawg_debug_level >= 3) { tprintf("def_letter_is_okay: word_index=%d word_end=%d" " word=%s num active dawgs=%d num constraints=%d\n", word_index, word_end, word->debug_string(getUnicharset()).string(), dawg_args->active_dawgs->length(), dawg_args->constraints->length()); } // Do not accept words that contain kPatternUnicharID. // (otherwise pattern dawgs would not function correctly). // Do not accept words containing INVALID_UNICHAR_IDs. UNICHAR_ID unichar_id = word->unichar_id(word_index); if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) { dawg_args->permuter = NO_PERM; return NO_PERM; } // Initialization. PermuterType current_permuter = NO_PERM; dawg_args->updated_active_dawgs->clear(); const DawgInfoVector &constraints = *(dawg_args->constraints); *dawg_args->updated_constraints = constraints; // Go over the active_dawgs vector and insert DawgInfo records with the // updated ref (an edge with the corresponding unichar id) into // dawg_args->updated_active_dawgs. for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { const DawgInfo &info = (*dawg_args->active_dawgs)[a]; const Dawg *dawg = dawgs_[info.dawg_index]; // Obtain unichar_id at this position (could be changed later, so this // needs to be inside the loop over all active dawgs). unichar_id = word->unichar_id(word_index); // The number dawg generalizes all digits to be kPatternUnicharID, // so try to match kPatternUnicharID if the current unichar is a digit. 
if (dawg->type() == DAWG_TYPE_NUMBER && getUnicharset().get_isdigit(unichar_id)) { unichar_id = Dawg::kPatternUnicharID; } // Get the starting node for this letter. NODE_REF node; if (info.ref == NO_EDGE) { node = 0; // beginning to explore this dawg } else { node = dawg->next_node(info.ref); if (node == 0) node = NO_EDGE; // end of word } // Find the edge out of the node for the curent unichar_id. EDGE_REF edge = (node != NO_EDGE) ? dawg->edge_char_of(node, unichar_id, word_end) : NO_EDGE; if (dawg_debug_level >= 3) { tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", info.dawg_index, node, edge); } if (edge != NO_EDGE) { // the unichar was found in the current dawg if (ConstraintsOk(*(dawg_args->updated_constraints), word_end, dawg->type())) { UpdatePermuter(dawg->permuter(), ¤t_permuter); dawg_args->updated_active_dawgs->add_unique( DawgInfo(info.dawg_index, edge), "Append current dawg to updated active dawgs: "); } } else { // the unichar was not found in the current dawg // Handle leading/trailing punctuation dawgs that denote a word pattern // as an edge with kPatternUnicharID. If such an edge is found we add a // constraint denoting the state of the dawg before the word pattern. // This constraint will be applied later when this dawg is found among // successor dawgs as well potentially at the end of the word. if (dawg->type() == DAWG_TYPE_PUNCTUATION) { edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end); if (edge != NO_EDGE) { dawg_args->updated_constraints->add_unique( DawgInfo(info.dawg_index, edge), "Recording constraint: "); } else { // Do not explore successors of this dawg, since this // must be invalid leading or trailing punctuation. 
if (dawg_debug_level >= 3) { tprintf("Invalid punctuation from dawg %d\n", info.dawg_index); } continue; } } if (info.ref == NO_EDGE) { if (dawg_debug_level >= 3) { tprintf("No letters matched in dawg %d\n", info.dawg_index); } continue; } // Discard the dawg if the pattern can not end at previous letter. if (edge == NO_EDGE && // previous part is not leading punctuation !dawg->end_of_word(info.ref)) { if (dawg_debug_level >= 3) { tprintf("No valid pattern end in dawg %d\n", info.dawg_index); } continue; } // Look for the unichar in each of this dawg's successors // and append those in which it is found to active_dawgs. const SuccessorList &slist = *(successors_[info.dawg_index]); for (int s = 0; s < slist.length(); ++s) { int sdawg_index = slist[s]; const Dawg *sdawg = dawgs_[sdawg_index]; NODE_REF snode = 0; // Apply constraints to the successor dawg. for (int c = 0; c < constraints.length(); ++c) { // If the successor dawg is described in the constraints change // the start ref from 0 to the one recorded as the constraint. const DawgInfo &cinfo = constraints[c]; if (cinfo.dawg_index == sdawg_index) { snode = sdawg->next_node(cinfo.ref); // Make sure we do not search the successor dawg if after // applying the saved constraint we are at the end of the word. if (snode == 0) snode = NO_EDGE; if (dawg_debug_level >= 3) { tprintf("Applying constraint [%d, " REFFORMAT "]\n", sdawg_index, snode); } } } // Look for the letter in this successor dawg. EDGE_REF sedge = sdawg->edge_char_of( snode, word->unichar_id(word_index), word_end); // If we found the letter append sdawg to the active_dawgs list. 
if (sedge != NO_EDGE && ConstraintsOk(*(dawg_args->updated_constraints), word_end, dawgs_[sdawg_index]->type())) { UpdatePermuter(sdawg->permuter(), ¤t_permuter); if (sdawg->next_node(sedge) != 0) { // if not word end dawg_args->updated_active_dawgs->add_unique( DawgInfo(sdawg_index, sedge), "Append successor to updated active dawgs: "); } } } // end successors loop } // end if/else } // end for // Update dawg_args->permuter if it used to be NO_PERM or if we found // the current letter in a non-punctuation dawg. This allows preserving // information on which dawg the "core" word came from. if ((current_permuter == PUNC_PERM && current_permuter > dawg_args->permuter) || current_permuter != PUNC_PERM) { dawg_args->permuter = current_permuter; } return dawg_args->permuter; }
/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args); word_ending = (char_choice_index == char_choices.size()-1); int word_index = word->length() - 1; if (best_choice->rating() < *limit) return; // Look up char in DAWG // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (dawg_debug_level) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int num_unigrams = 0; word->remove_last_unichar_id(); GenericVector<UNICHAR_ID> encoding; const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); // Since the string came out of the unicharset, failure is impossible. ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); DawgPositionVector unigram_updated_dawgs; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); // Check unigrams in the ngram with letter_is_okay(). 
for (int i = 0; unigrams_ok && i < encoding.size(); ++i) { UNICHAR_ID uch_id = encoding[i]; ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = (this->*letter_is_okay_)( &unigram_dawg_args, word->unichar_id(word_index+num_unigrams-1), word_ending && i == encoding.size() - 1); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); if (dawg_debug_level) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); } } // Check which dawgs from the dawgs_ vector contain the word // up to and including the current unichar. 
if (checked_unigrams || (this->*letter_is_okay_)( more_args, word->unichar_id(word_index), word_ending)) { // Add a new word choice if (word_ending) { if (dawg_debug_level) { tprintf("found word = %s\n", word->debug_string().string()); } if (strcmp(output_ambig_words_file.string(), "") != 0) { if (output_ambig_words_file_ == nullptr) { output_ambig_words_file_ = fopen(output_ambig_words_file.string(), "wb+"); if (output_ambig_words_file_ == nullptr) { tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.string()); exit(1); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } WERD_CHOICE *adjusted_word = word; adjusted_word->set_permuter(more_args->permuter); update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgPositionVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_dawgs); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_dawgs); --(more_args->active_dawgs); } } else { if (dawg_debug_level) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().string()); } } }
/** * @name fragment_state * * Given the current char choice and information about previously seen * fragments, determines whether adjacent character fragments are * present and whether they can be concatenated. * * The given prev_char_frag_info contains: * - fragment: if not nullptr contains information about immediately * preceding fragmented character choice * - num_fragments: number of fragments that have been used so far * to construct a character * - certainty: certainty of the current choice or minimum * certainty of all fragments concatenated so far * - rating: rating of the current choice or sum of fragment * ratings concatenated so far * * The output char_frag_info is filled in as follows: * - character: is set to be nullptr if the choice is a non-matching * or non-ending fragment piece; is set to unichar of the given choice * if it represents a regular character or a matching ending fragment * - fragment,num_fragments,certainty,rating are set as described above * * @returns false if a non-matching fragment is discovered, true otherwise. */ bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) { const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id); const CHAR_FRAGMENT *prev_fragment = prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr; // Print debug info for fragments. 
if (debug && (prev_fragment || this_fragment)) { tprintf("%s check fragments: choice=%s word_ending=%d\n", debug, getUnicharset().debug_str(curr_unichar_id).string(), word_ending); if (prev_fragment) { tprintf("prev_fragment %s\n", prev_fragment->to_string().string()); } if (this_fragment) { tprintf("this_fragment %s\n", this_fragment->to_string().string()); } } char_frag_info->unichar_id = curr_unichar_id; char_frag_info->fragment = this_fragment; char_frag_info->rating = curr_rating; char_frag_info->certainty = curr_certainty; char_frag_info->num_fragments = 1; if (prev_fragment && !this_fragment) { if (debug) tprintf("Skip choice with incomplete fragment\n"); return false; } if (this_fragment) { // We are dealing with a fragment. char_frag_info->unichar_id = INVALID_UNICHAR_ID; if (prev_fragment) { if (!this_fragment->is_continuation_of(prev_fragment)) { if (debug) tprintf("Non-matching fragment piece\n"); return false; } if (this_fragment->is_ending()) { char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar()); char_frag_info->fragment = nullptr; if (debug) { tprintf("Built character %s from fragments\n", getUnicharset().debug_str( char_frag_info->unichar_id).string()); } } else { if (debug) tprintf("Record fragment continuation\n"); char_frag_info->fragment = this_fragment; } // Update certainty and rating. char_frag_info->rating = prev_char_frag_info->rating + curr_rating; char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1; char_frag_info->certainty = MIN(curr_certainty, prev_char_frag_info->certainty); } else { if (this_fragment->is_beginning()) { if (debug) tprintf("Record fragment beginning\n"); } else { if (debug) { tprintf("Non-starting fragment piece with no prev_fragment\n"); } return false; } } } if (word_ending && char_frag_info->fragment) { if (debug) tprintf("Word can not end with a fragment\n"); return false; } return true; }
/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. * * There are two modes for deciding whether to go deeper: regular dawg * permuter mode and the special ambigs mode. If *limit is <= 0.0 the * function switches to the ambigs mode (this is the case when * dawg_permute_and_select() function is called from NoDangerousAmbigs()) and * only searches for the first choice that has a rating better than *limit * (in this case ratings are fake, since the real ratings can not be < 0). * Modification of the hyphen state is turned off in the ambigs mode. * When in the regular dawg permuter mode, the function explores all the * possible words and chooses the one with the best rating. The letters with * ratings that are far worse than the ones seen so far are pruned out. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *void_more_args) { DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args); int word_index = word->length() - 1; bool ambigs_mode = (*limit <= 0.0); if (ambigs_mode) { if (best_choice->rating() < *limit) return; } else { // Prune bad subwords if (more_args->rating_array[word_index] == NO_RATING) { more_args->rating_array[word_index] = word->rating(); } else { float permdawg_limit = more_args->rating_array[word_index] * more_args->rating_margin + kPermDawgRatingPad; if (permdawg_limit < word->rating()) { if (segment_dawg_debug) { tprintf("early pruned word rating=%4.2f," " permdawg_limit=%4.2f, word=%s\n", word->rating(), permdawg_limit, word->debug_string(getUnicharset()).string()); } return; } } } // Deal with hyphens if (word_ending && has_hyphen_end(*word) && !ambigs_mode) { if (segment_dawg_debug) tprintf("new hyphen choice = %s\n", 
word->debug_string(getUnicharset()).string()); word->set_permuter(more_args->permuter); adjust_word(word, certainties); set_hyphen_word(*word, *(more_args->active_dawgs), *(more_args->constraints)); update_best_choice(*word, best_choice); } else { // Look up char in DAWG // TODO(daria): update the rest of the code that specifies alternative // letter_is_okay_ functions (e.g. TessCharNgram class) to work with // multi-byte unichars and/or unichar ids. // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (segment_dawg_debug) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int orig_num_fragments = word->fragment_length(word_index); int num_unigrams = 0; word->remove_last_unichar_id(); const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); const char *ngram_str_end = ngram_str + strlen(ngram_str); const char *ngram_ptr = ngram_str; bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs); DawgInfoVector unigram_constraints = *(more_args->constraints); DawgInfoVector unigram_updated_active_dawgs; DawgInfoVector unigram_updated_constraints; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints, &unigram_updated_active_dawgs, &unigram_updated_constraints, 0.0); unigram_dawg_args.permuter = more_args->permuter; // Check unigrams in the ngram with letter_is_okay(). while (unigrams_ok && ngram_ptr < ngram_str_end) { int step = getUnicharset().step(ngram_ptr); UNICHAR_ID uch_id = (step <= 0) ? 
INVALID_UNICHAR_ID : getUnicharset().unichar_to_id(ngram_ptr, step); ngram_ptr += step; ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = unigrams_ok && (this->*letter_is_okay_)( &unigram_dawg_args, word_index+num_unigrams-1, word, word_ending && (ngram_ptr == ngram_str_end)); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); (*unigram_dawg_args.constraints) = *(unigram_dawg_args.updated_constraints); if (segment_dawg_debug) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated( orig_uch_id, orig_num_fragments, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); *(more_args->updated_constraints) = *(unigram_dawg_args.updated_constraints); } } // Check which dawgs from dawgs_ vector contain the word // up to and including the current unichar. 
if (checked_unigrams || (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) { // Add a new word choice if (word_ending) { if (segment_dawg_debug) { tprintf("found word = %s\n", word->debug_string(getUnicharset()).string()); } WERD_CHOICE *adjusted_word = word; WERD_CHOICE hyphen_tail_word; if (!ambigs_mode && hyphen_base_size() > 0) { hyphen_tail_word = *word; remove_hyphen_head(&hyphen_tail_word); adjusted_word = &hyphen_tail_word; } adjusted_word->set_permuter(more_args->permuter); if (!ambigs_mode) { adjust_word(adjusted_word, &certainties[hyphen_base_size()]); } update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgInfoVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_active_dawgs); ++(more_args->updated_constraints); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); ++(more_args->constraints); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_active_dawgs); --(more_args->updated_constraints); --(more_args->active_dawgs); --(more_args->constraints); } } else { if (segment_dawg_debug) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string(getUnicharset()).string()); } } } }