/**
 * dawg_permute_and_select
 *
 * Recursively walks every character combination offered by char_choices.
 * go_deeper_dawg_fxn() is installed as the per-character callback so that
 * the dawgs are searched in parallel and invalid words are discarded.
 *
 * @param char_choices  candidate characters for each position of the word
 * @param rating_limit  initial rating given to the returned choice; its
 *                      address is also handed to permute_choices() as the
 *                      search limit
 * @return a WERD_CHOICE allocated with new, holding the best valid word
 *         found. It is returned in its initial "bad" state when
 *         char_choices is empty or the word would be too long to permute.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *best_choice = new WERD_CHOICE();
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);

  const int num_choices = char_choices.length();
  if (num_choices == 0) return best_choice;

  // One entry per character position plus one extra, so that the "updated"
  // pointers in DawgArgs can always refer to the entry after the active one.
  DawgInfoVector *active_dawgs = new DawgInfoVector[num_choices + 1];
  DawgInfoVector *constraints = new DawgInfoVector[num_choices + 1];
  init_active_dawgs(&active_dawgs[0]);
  init_constraints(&constraints[0]);
  DawgArgs dawg_args(
      &active_dawgs[0], &constraints[0], &active_dawgs[1], &constraints[1],
      (segment_penalty_dict_case_bad / segment_penalty_dict_case_ok));

  WERD_CHOICE word(MAX_WERD_LENGTH);
  copy_hyphen_info(&word);
  // Discard rating and certainty of the hyphen base (if any).
  word.set_rating(0.0);
  word.set_certainty(0.0);

  // Only permute when the word (including whatever copy_hyphen_info() put
  // into it) fits; otherwise the word is too long to permute.
  if (word.length() + num_choices <= MAX_WERD_LENGTH) {
    float certainties[MAX_WERD_LENGTH];
    this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
    const char *debug_tag = segment_dawg_debug ? "segment_dawg_debug" : nullptr;
    permute_choices(debug_tag, char_choices, 0, nullptr, &word, certainties,
                    &rating_limit, best_choice, &dawg_args);
  }

  delete[] active_dawgs;
  delete[] constraints;
  return best_choice;
}
/** * append_choices * * Checks to see whether or not the next choice is worth appending to * the word being generated. If so then keeps going deeper into the word. * * This function assumes that Dict::go_deeper_fxn_ is set. */ void Dict::append_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { int word_ending = (char_choice_index == char_choices.length() - 1) ? true : false; // Deal with fragments. CHAR_FRAGMENT_INFO char_frag_info; if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), prev_char_frag_info, debug, word_ending, &char_frag_info)) { return; // blob_choice must be an invalid fragment } // Search the next letter if this character is a fragment. if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); return; } // Add the next unichar. float old_rating = word->rating(); float old_certainty = word->certainty(); uint8_t old_permuter = word->permuter(); certainties[word->length()] = char_frag_info.certainty; word->append_unichar_id_space_allocated( char_frag_info.unichar_id, char_frag_info.num_fragments, char_frag_info.rating, char_frag_info.certainty); // Explore the next unichar. (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, word, certainties, limit, best_choice, attempts_left, more_args); // Remove the unichar we added to explore other choices in it's place. word->remove_last_unichar_id(); word->set_rating(old_rating); word->set_certainty(old_certainty); word->set_permuter(old_permuter); }
/**
 * dawg_permute_and_select
 *
 * Recursively walks every character combination offered by char_choices.
 * go_deeper_dawg_fxn() is installed as the per-character callback so that
 * the dawgs are searched in parallel and invalid words are discarded.
 *
 * @param char_choices  candidate characters for each position of the word
 * @param rating_limit  initial rating given to the returned choice; its
 *                      address is also handed to permute_choices() as the
 *                      search limit
 * @return a WERD_CHOICE allocated with new, holding the best valid word
 *         found. It is returned in its initial "bad" state when
 *         char_choices is empty or longer than MAX_WERD_LENGTH.
 */
WERD_CHOICE *Dict::dawg_permute_and_select(
    const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);

  // Nothing to permute, or more positions than the fixed-size buffers hold.
  const int num_choices = char_choices.length();
  const bool permutable = (num_choices > 0 && num_choices <= MAX_WERD_LENGTH);
  if (!permutable) return best_choice;

  // One entry per character position plus one extra, so that the "updated"
  // pointer in DawgArgs can always refer to the entry after the active one.
  DawgPositionVector *active_dawgs = new DawgPositionVector[num_choices + 1];
  init_active_dawgs(&active_dawgs[0], true);
  DawgArgs dawg_args(&active_dawgs[0], &active_dawgs[1], NO_PERM);

  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  const char *debug_tag = (dawg_debug_level) ? "permute_dawg_debug" : nullptr;
  permute_choices(debug_tag, char_choices, 0, nullptr, &word, certainties,
                  &rating_limit, best_choice, &attempts_left, &dawg_args);

  delete[] active_dawgs;
  return best_choice;
}
/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args); word_ending = (char_choice_index == char_choices.size()-1); int word_index = word->length() - 1; if (best_choice->rating() < *limit) return; // Look up char in DAWG // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (dawg_debug_level) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int num_unigrams = 0; word->remove_last_unichar_id(); GenericVector<UNICHAR_ID> encoding; const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); // Since the string came out of the unicharset, failure is impossible. ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); DawgPositionVector unigram_updated_dawgs; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); // Check unigrams in the ngram with letter_is_okay(). 
for (int i = 0; unigrams_ok && i < encoding.size(); ++i) { UNICHAR_ID uch_id = encoding[i]; ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = (this->*letter_is_okay_)( &unigram_dawg_args, word->unichar_id(word_index+num_unigrams-1), word_ending && i == encoding.size() - 1); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); if (dawg_debug_level) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); } } // Check which dawgs from the dawgs_ vector contain the word // up to and including the current unichar. 
if (checked_unigrams || (this->*letter_is_okay_)( more_args, word->unichar_id(word_index), word_ending)) { // Add a new word choice if (word_ending) { if (dawg_debug_level) { tprintf("found word = %s\n", word->debug_string().string()); } if (strcmp(output_ambig_words_file.string(), "") != 0) { if (output_ambig_words_file_ == nullptr) { output_ambig_words_file_ = fopen(output_ambig_words_file.string(), "wb+"); if (output_ambig_words_file_ == nullptr) { tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.string()); exit(1); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } WERD_CHOICE *adjusted_word = word; adjusted_word->set_permuter(more_args->permuter); update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgPositionVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_dawgs); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_dawgs); --(more_args->active_dawgs); } } else { if (dawg_debug_level) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().string()); } } }
/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. * * There are two modes for deciding whether to go deeper: regular dawg * permuter mode and the special ambigs mode. If *limit is <= 0.0 the * function switches to the ambigs mode (this is the case when * dawg_permute_and_select() function is called from NoDangerousAmbigs()) and * only searches for the first choice that has a rating better than *limit * (in this case ratings are fake, since the real ratings can not be < 0). * Modification of the hyphen state is turned off in the ambigs mode. * When in the regular dawg permuter mode, the function explores all the * possible words and chooses the one with the best rating. The letters with * ratings that are far worse than the ones seen so far are pruned out. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *void_more_args) { DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args); int word_index = word->length() - 1; bool ambigs_mode = (*limit <= 0.0); if (ambigs_mode) { if (best_choice->rating() < *limit) return; } else { // Prune bad subwords if (more_args->rating_array[word_index] == NO_RATING) { more_args->rating_array[word_index] = word->rating(); } else { float permdawg_limit = more_args->rating_array[word_index] * more_args->rating_margin + kPermDawgRatingPad; if (permdawg_limit < word->rating()) { if (segment_dawg_debug) { tprintf("early pruned word rating=%4.2f," " permdawg_limit=%4.2f, word=%s\n", word->rating(), permdawg_limit, word->debug_string(getUnicharset()).string()); } return; } } } // Deal with hyphens if (word_ending && has_hyphen_end(*word) && !ambigs_mode) { if (segment_dawg_debug) tprintf("new hyphen choice = %s\n", 
word->debug_string(getUnicharset()).string()); word->set_permuter(more_args->permuter); adjust_word(word, certainties); set_hyphen_word(*word, *(more_args->active_dawgs), *(more_args->constraints)); update_best_choice(*word, best_choice); } else { // Look up char in DAWG // TODO(daria): update the rest of the code that specifies alternative // letter_is_okay_ functions (e.g. TessCharNgram class) to work with // multi-byte unichars and/or unichar ids. // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (segment_dawg_debug) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int orig_num_fragments = word->fragment_length(word_index); int num_unigrams = 0; word->remove_last_unichar_id(); const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); const char *ngram_str_end = ngram_str + strlen(ngram_str); const char *ngram_ptr = ngram_str; bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs); DawgInfoVector unigram_constraints = *(more_args->constraints); DawgInfoVector unigram_updated_active_dawgs; DawgInfoVector unigram_updated_constraints; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints, &unigram_updated_active_dawgs, &unigram_updated_constraints, 0.0); unigram_dawg_args.permuter = more_args->permuter; // Check unigrams in the ngram with letter_is_okay(). while (unigrams_ok && ngram_ptr < ngram_str_end) { int step = getUnicharset().step(ngram_ptr); UNICHAR_ID uch_id = (step <= 0) ? 
INVALID_UNICHAR_ID : getUnicharset().unichar_to_id(ngram_ptr, step); ngram_ptr += step; ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = unigrams_ok && (this->*letter_is_okay_)( &unigram_dawg_args, word_index+num_unigrams-1, word, word_ending && (ngram_ptr == ngram_str_end)); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); (*unigram_dawg_args.constraints) = *(unigram_dawg_args.updated_constraints); if (segment_dawg_debug) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated( orig_uch_id, orig_num_fragments, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); *(more_args->updated_constraints) = *(unigram_dawg_args.updated_constraints); } } // Check which dawgs from dawgs_ vector contain the word // up to and including the current unichar. 
if (checked_unigrams || (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) { // Add a new word choice if (word_ending) { if (segment_dawg_debug) { tprintf("found word = %s\n", word->debug_string(getUnicharset()).string()); } WERD_CHOICE *adjusted_word = word; WERD_CHOICE hyphen_tail_word; if (!ambigs_mode && hyphen_base_size() > 0) { hyphen_tail_word = *word; remove_hyphen_head(&hyphen_tail_word); adjusted_word = &hyphen_tail_word; } adjusted_word->set_permuter(more_args->permuter); if (!ambigs_mode) { adjust_word(adjusted_word, &certainties[hyphen_base_size()]); } update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgInfoVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_active_dawgs); ++(more_args->updated_constraints); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); ++(more_args->constraints); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_active_dawgs); --(more_args->updated_constraints); --(more_args->active_dawgs); --(more_args->constraints); } } else { if (segment_dawg_debug) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string(getUnicharset()).string()); } } } }