/** * BLOB_CHOICE::BLOB_CHOICE * * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. */ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); config_ = other.config(); script_id_ = other.script_id(); }
/** * BLOB_CHOICE::BLOB_CHOICE * * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. */ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); config_ = other.config(); config2_ = other.config2(); script_id_ = other.script_id(); language_model_state_ = NULL; }
/** * BLOB_CHOICE::BLOB_CHOICE * * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. */ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); fontinfo_id_ = other.fontinfo_id(); fontinfo_id2_ = other.fontinfo_id2(); script_id_ = other.script_id(); language_model_state_ = NULL; min_xheight_ = other.min_xheight_; max_xheight_ = other.max_xheight_; adapted_ = other.adapted_; }
// Helper to find the BLOB_CHOICE in the bc_list that matches the given // unichar_id, or NULL if there is no match. BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST* bc_list) { // Find the corresponding best BLOB_CHOICE. BLOB_CHOICE_IT choice_it(bc_list); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); if (choice->unichar_id() == char_id) { return choice; } } return NULL; }
/** * BLOB_CHOICE::BLOB_CHOICE * * Constructor to build a BLOB_CHOICE from another BLOB_CHOICE. */ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { unichar_id_ = other.unichar_id(); rating_ = other.rating(); certainty_ = other.certainty(); fontinfo_id_ = other.fontinfo_id(); fontinfo_id2_ = other.fontinfo_id2(); script_id_ = other.script_id(); matrix_cell_ = other.matrix_cell_; min_xheight_ = other.min_xheight_; max_xheight_ = other.max_xheight_; yshift_ = other.yshift(); classifier_ = other.classifier_; }
/** * append_choices * * Checks to see whether or not the next choice is worth appending to * the word being generated. If so then keeps going deeper into the word. * * This function assumes that Dict::go_deeper_fxn_ is set. */ void Dict::append_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { int word_ending = (char_choice_index == char_choices.length() - 1) ? true : false; // Deal with fragments. CHAR_FRAGMENT_INFO char_frag_info; if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), prev_char_frag_info, debug, word_ending, &char_frag_info)) { return; // blob_choice must be an invalid fragment } // Search the next letter if this character is a fragment. if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); return; } // Add the next unichar. float old_rating = word->rating(); float old_certainty = word->certainty(); uint8_t old_permuter = word->permuter(); certainties[word->length()] = char_frag_info.certainty; word->append_unichar_id_space_allocated( char_frag_info.unichar_id, char_frag_info.num_fragments, char_frag_info.rating, char_frag_info.certainty); // Explore the next unichar. (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, word, certainties, limit, best_choice, attempts_left, more_args); // Remove the unichar we added to explore other choices in it's place. word->remove_last_unichar_id(); word->set_rating(old_rating); word->set_certainty(old_certainty); word->set_permuter(old_permuter); }
// Creates a fake blob choice from the combination of the given fragments. // unichar is the class to be made from the combination, // expanded_fragment_lengths[choice_index] is the number of fragments to use. // old_choices[choice_index] has the classifier output for each fragment. // choice index initially indexes the last fragment and should be decremented // expanded_fragment_lengths[choice_index] times to get the earlier fragments. // Guarantees to return something non-null, or abort! BLOB_CHOICE* Wordrec::rebuild_fragments( const char* unichar, const char* expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) { float rating = 0.0f; float certainty = 0.0f; inT16 min_xheight = -MAX_INT16; inT16 max_xheight = MAX_INT16; for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1; fragment_pieces >= 0; --fragment_pieces, --choice_index) { // Get a pointer to the classifier results from the old_choices. BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index); // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[choice_index], false); BLOB_CHOICE_IT choice_it(current_choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(choice->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += choice->rating(); if (choice->certainty() < certainty) { certainty = choice->certainty(); } IntersectRange(choice->min_xheight(), choice->max_xheight(), &min_xheight, &max_xheight); break; } } if (choice_it.cycled_list()) { print_ratings_list("Failure", current_choices, unicharset); tprintf("Failed to find fragment %s at index=%d\n", fragment.to_string().string(), choice_index); } ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment. } return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, -1, -1, 0, min_xheight, max_xheight, false); }
/// Recursive helper to find a match to the target_text (from text_index /// position) in the choices (from choices_pos position). /// @param choices is an array of GenericVectors, of length choices_length, /// with each element representing a starting position in the word, and the /// #GenericVector holding classification results for a sequence of consecutive /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. /// @param choices_pos /// @param choices_length /// @param target_text /// @param text_index /// @param rating /// @param segmentation /// @param best_rating /// @param best_segmentation void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, int choices_pos, int choices_length, const GenericVector<UNICHAR_ID>& target_text, int text_index, float rating, GenericVector<int>* segmentation, float* best_rating, GenericVector<int>* best_segmentation) { const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); for (int length = 1; length <= choices[choices_pos].size(); ++length) { // Rating of matching choice or worst choice if no match. float choice_rating = 0.0f; // Find the corresponding best BLOB_CHOICE. BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); choice_rating = choice->rating(); UNICHAR_ID class_id = choice->unichar_id(); if (class_id == target_text[text_index]) { break; } // Search ambigs table. if (class_id < table.size() && table[class_id] != NULL) { AmbigSpec_IT spec_it(table[class_id]); for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) { const AmbigSpec *ambig_spec = spec_it.data(); // We'll only do 1-1. if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && ambig_spec->correct_ngram_id == target_text[text_index]) break; } if (!spec_it.cycled_list()) break; // Found an ambig. } } if (choice_it.cycled_list()) continue; // No match. segmentation->push_back(length); if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) { // This is a complete match. If the rating is good record a new best. if (applybox_debug > 2) { tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", rating + choice_rating, *best_rating, segmentation->size(), best_segmentation->size()); } if (best_segmentation->empty() || rating + choice_rating < *best_rating) { *best_segmentation = *segmentation; *best_rating = rating + choice_rating; } } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) { if (applybox_debug > 3) { tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index], unicharset.id_to_unichar(target_text[text_index]), choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig", choices_pos, length); } SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1, rating + choice_rating, segmentation, best_rating, best_segmentation); if (applybox_debug > 3) { tprintf("End recursion for %d=%s\n", target_text[text_index], unicharset.id_to_unichar(target_text[text_index])); } } segmentation->truncate(segmentation->size() - 1); } }