/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) { DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args); word_ending = (char_choice_index == char_choices.size()-1); int word_index = word->length() - 1; if (best_choice->rating() < *limit) return; // Look up char in DAWG // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (dawg_debug_level) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int num_unigrams = 0; word->remove_last_unichar_id(); GenericVector<UNICHAR_ID> encoding; const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); // Since the string came out of the unicharset, failure is impossible. ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr)); bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs); DawgPositionVector unigram_updated_dawgs; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter); // Check unigrams in the ngram with letter_is_okay(). for (int i = 0; unigrams_ok && i < encoding.size(); ++i) { UNICHAR_ID uch_id = encoding[i]; ASSERT_HOST(uch_id != INVALID_UNICHAR_ID); ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = (this->*letter_is_okay_)( &unigram_dawg_args, word->unichar_id(word_index+num_unigrams-1), word_ending && i == encoding.size() - 1); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs); if (dawg_debug_level) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs); } } // Check which dawgs from the dawgs_ vector contain the word // up to and including the current unichar. if (checked_unigrams || (this->*letter_is_okay_)( more_args, word->unichar_id(word_index), word_ending)) { // Add a new word choice if (word_ending) { if (dawg_debug_level) { tprintf("found word = %s\n", word->debug_string().string()); } if (strcmp(output_ambig_words_file.string(), "") != 0) { if (output_ambig_words_file_ == nullptr) { output_ambig_words_file_ = fopen(output_ambig_words_file.string(), "wb+"); if (output_ambig_words_file_ == nullptr) { tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.string()); exit(1); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } STRING word_str; word->string_and_lengths(&word_str, nullptr); word_str += " "; fprintf(output_ambig_words_file_, "%s", word_str.string()); } WERD_CHOICE *adjusted_word = word; adjusted_word->set_permuter(more_args->permuter); update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgPositionVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_dawgs); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_dawgs); --(more_args->active_dawgs); } } else { if (dawg_debug_level) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().string()); } } }
/** * @name go_deeper_dawg_fxn * * If the choice being composed so far could be a dictionary word * keep exploring choices. * * There are two modes for deciding whether to go deeper: regular dawg * permuter mode and the special ambigs mode. If *limit is <= 0.0 the * function switches to the ambigs mode (this is the case when * dawg_permute_and_select() function is called from NoDangerousAmbigs()) and * only searches for the first choice that has a rating better than *limit * (in this case ratings are fake, since the real ratings can not be < 0). * Modification of the hyphen state is turned off in the ambigs mode. * When in the regular dawg permuter mode, the function explores all the * possible words and chooses the one with the best rating. The letters with * ratings that are far worse than the ones seen so far are pruned out. */ void Dict::go_deeper_dawg_fxn( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, void *void_more_args) { DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args); int word_index = word->length() - 1; bool ambigs_mode = (*limit <= 0.0); if (ambigs_mode) { if (best_choice->rating() < *limit) return; } else { // Prune bad subwords if (more_args->rating_array[word_index] == NO_RATING) { more_args->rating_array[word_index] = word->rating(); } else { float permdawg_limit = more_args->rating_array[word_index] * more_args->rating_margin + kPermDawgRatingPad; if (permdawg_limit < word->rating()) { if (segment_dawg_debug) { tprintf("early pruned word rating=%4.2f," " permdawg_limit=%4.2f, word=%s\n", word->rating(), permdawg_limit, word->debug_string(getUnicharset()).string()); } return; } } } // Deal with hyphens if (word_ending && has_hyphen_end(*word) && !ambigs_mode) { if (segment_dawg_debug) tprintf("new hyphen choice = %s\n", word->debug_string(getUnicharset()).string()); word->set_permuter(more_args->permuter); adjust_word(word, certainties); set_hyphen_word(*word, *(more_args->active_dawgs), *(more_args->constraints)); update_best_choice(*word, best_choice); } else { // Look up char in DAWG // TODO(daria): update the rest of the code that specifies alternative // letter_is_okay_ functions (e.g. TessCharNgram class) to work with // multi-byte unichars and/or unichar ids. // If the current unichar is an ngram first try calling // letter_is_okay() for each unigram it contains separately. UNICHAR_ID orig_uch_id = word->unichar_id(word_index); bool checked_unigrams = false; if (getUnicharset().get_isngram(orig_uch_id)) { if (segment_dawg_debug) { tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).string()); } int orig_num_fragments = word->fragment_length(word_index); int num_unigrams = 0; word->remove_last_unichar_id(); const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id); const char *ngram_str_end = ngram_str + strlen(ngram_str); const char *ngram_ptr = ngram_str; bool unigrams_ok = true; // Construct DawgArgs that reflect the current state. DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs); DawgInfoVector unigram_constraints = *(more_args->constraints); DawgInfoVector unigram_updated_active_dawgs; DawgInfoVector unigram_updated_constraints; DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_constraints, &unigram_updated_active_dawgs, &unigram_updated_constraints, 0.0); unigram_dawg_args.permuter = more_args->permuter; // Check unigrams in the ngram with letter_is_okay(). while (unigrams_ok && ngram_ptr < ngram_str_end) { int step = getUnicharset().step(ngram_ptr); UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID : getUnicharset().unichar_to_id(ngram_ptr, step); ngram_ptr += step; ++num_unigrams; word->append_unichar_id(uch_id, 1, 0.0, 0.0); unigrams_ok = unigrams_ok && (this->*letter_is_okay_)( &unigram_dawg_args, word_index+num_unigrams-1, word, word_ending && (ngram_ptr == ngram_str_end)); (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); (*unigram_dawg_args.constraints) = *(unigram_dawg_args.updated_constraints); if (segment_dawg_debug) { tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).string(), unigrams_ok ? "OK" : "not OK"); } } // Restore the word and copy the updated dawg state if needed. while (num_unigrams-- > 0) word->remove_last_unichar_id(); word->append_unichar_id_space_allocated( orig_uch_id, orig_num_fragments, 0.0, 0.0); if (unigrams_ok) { checked_unigrams = true; more_args->permuter = unigram_dawg_args.permuter; *(more_args->updated_active_dawgs) = *(unigram_dawg_args.updated_active_dawgs); *(more_args->updated_constraints) = *(unigram_dawg_args.updated_constraints); } } // Check which dawgs from dawgs_ vector contain the word // up to and including the current unichar. if (checked_unigrams || (this->*letter_is_okay_)(more_args, word_index, word, word_ending)) { // Add a new word choice if (word_ending) { if (segment_dawg_debug) { tprintf("found word = %s\n", word->debug_string(getUnicharset()).string()); } WERD_CHOICE *adjusted_word = word; WERD_CHOICE hyphen_tail_word; if (!ambigs_mode && hyphen_base_size() > 0) { hyphen_tail_word = *word; remove_hyphen_head(&hyphen_tail_word); adjusted_word = &hyphen_tail_word; } adjusted_word->set_permuter(more_args->permuter); if (!ambigs_mode) { adjust_word(adjusted_word, &certainties[hyphen_base_size()]); } update_best_choice(*adjusted_word, best_choice); } else { // search the next letter // Make updated_* point to the next entries in the DawgInfoVector // arrays (that were originally created in dawg_permute_and_select) ++(more_args->updated_active_dawgs); ++(more_args->updated_constraints); // Make active_dawgs and constraints point to the updated ones. ++(more_args->active_dawgs); ++(more_args->constraints); permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word, certainties, limit, best_choice, more_args); // Restore previous state to explore another letter in this position. --(more_args->updated_active_dawgs); --(more_args->updated_constraints); --(more_args->active_dawgs); --(more_args->constraints); } } else { if (segment_dawg_debug) { tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string(getUnicharset()).string()); } } } }