/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: fills this BLOB_CHOICE with the values reported by the
 * accessors of another BLOB_CHOICE.
 *
 * @param other  the BLOB_CHOICE whose values are duplicated.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Ids taken from the source choice.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  config_ = other.config();

  // Rating and certainty taken from the source choice.
  rating_ = other.rating();
  certainty_ = other.certainty();
}
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: fills this BLOB_CHOICE with the values reported by the
 * accessors of another BLOB_CHOICE. The language model state is not taken
 * from the source; it is set to NULL.
 *
 * @param other  the BLOB_CHOICE whose values are duplicated.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Ids taken from the source choice.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  config_ = other.config();
  config2_ = other.config2();

  // Rating and certainty taken from the source choice.
  rating_ = other.rating();
  certainty_ = other.certainty();

  // Deliberately not copied from other.
  language_model_state_ = NULL;
}
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: fills this BLOB_CHOICE from another BLOB_CHOICE, using
 * its accessors where the original code did and its data members otherwise.
 * The language model state is not taken from the source; it is set to NULL.
 *
 * @param other  the BLOB_CHOICE whose values are duplicated.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Ids taken from the source choice.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  fontinfo_id_ = other.fontinfo_id();
  fontinfo_id2_ = other.fontinfo_id2();

  // Rating and certainty taken from the source choice.
  rating_ = other.rating();
  certainty_ = other.certainty();

  // x-height range and adapted flag, read directly from the members.
  min_xheight_ = other.min_xheight_;
  max_xheight_ = other.max_xheight_;
  adapted_ = other.adapted_;

  // Deliberately not copied from other.
  language_model_state_ = NULL;
}
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: fills this BLOB_CHOICE from another BLOB_CHOICE, using
 * its accessors where the original code did and its data members otherwise.
 *
 * @param other  the BLOB_CHOICE whose values are duplicated.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Ids taken from the source choice.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  fontinfo_id_ = other.fontinfo_id();
  fontinfo_id2_ = other.fontinfo_id2();

  // Rating and certainty taken from the source choice.
  rating_ = other.rating();
  certainty_ = other.certainty();

  // Matrix cell and classifier, read directly from the members.
  matrix_cell_ = other.matrix_cell_;
  classifier_ = other.classifier_;

  // x-height range and vertical shift.
  min_xheight_ = other.min_xheight_;
  max_xheight_ = other.max_xheight_;
  yshift_ = other.yshift();
}
/** * append_choices * * Checks to see whether or not the next choice is worth appending to * the word being generated. If so then keeps going deeper into the word. * * This function assumes that Dict::go_deeper_fxn_ is set. */ void Dict::append_choices( const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args) { int word_ending = (char_choice_index == char_choices.length() - 1) ? true : false; // Deal with fragments. CHAR_FRAGMENT_INFO char_frag_info; if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(), prev_char_frag_info, debug, word_ending, &char_frag_info)) { return; // blob_choice must be an invalid fragment } // Search the next letter if this character is a fragment. if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) { permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties, limit, best_choice, attempts_left, more_args); return; } // Add the next unichar. float old_rating = word->rating(); float old_certainty = word->certainty(); uint8_t old_permuter = word->permuter(); certainties[word->length()] = char_frag_info.certainty; word->append_unichar_id_space_allocated( char_frag_info.unichar_id, char_frag_info.num_fragments, char_frag_info.rating, char_frag_info.certainty); // Explore the next unichar. (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending, word, certainties, limit, best_choice, attempts_left, more_args); // Remove the unichar we added to explore other choices in it's place. word->remove_last_unichar_id(); word->set_rating(old_rating); word->set_certainty(old_certainty); word->set_permuter(old_permuter); }
// Creates a fake blob choice from the combination of the given fragments. // unichar is the class to be made from the combination, // expanded_fragment_lengths[choice_index] is the number of fragments to use. // old_choices[choice_index] has the classifier output for each fragment. // choice index initially indexes the last fragment and should be decremented // expanded_fragment_lengths[choice_index] times to get the earlier fragments. // Guarantees to return something non-null, or abort! BLOB_CHOICE* Wordrec::rebuild_fragments( const char* unichar, const char* expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) { float rating = 0.0f; float certainty = 0.0f; inT16 min_xheight = -MAX_INT16; inT16 max_xheight = MAX_INT16; for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1; fragment_pieces >= 0; --fragment_pieces, --choice_index) { // Get a pointer to the classifier results from the old_choices. BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index); // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. 
CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[choice_index], false); BLOB_CHOICE_IT choice_it(current_choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(choice->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += choice->rating(); if (choice->certainty() < certainty) { certainty = choice->certainty(); } IntersectRange(choice->min_xheight(), choice->max_xheight(), &min_xheight, &max_xheight); break; } } if (choice_it.cycled_list()) { print_ratings_list("Failure", current_choices, unicharset); tprintf("Failed to find fragment %s at index=%d\n", fragment.to_string().string(), choice_index); } ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment. } return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, -1, -1, 0, min_xheight, max_xheight, false); }
// Runs the matcher (*tess_matcher) on a blob and converts its ratings into a
// LIST built with append_choice().
// If the blob to match cannot be converted, a single fake, really bad choice
// for unichar id 1 is returned instead.
LIST call_matcher(TBLOB *ptblob,    // previous
                  TBLOB *tessblob,  // blob to match
                  TBLOB *ntblob,    // next
                  void *,           // unused parameter
                  TEXTROW *         // always null anyway
                  ) {
  char choice_lengths[2] = {0, 0};

  PBLOB *blob = make_ed_blob(tessblob);  // converted blob to match
  if (blob == NULL) {
    // Since it is actually possible to get a NULL blob here, due to invalid
    // segmentations, fake a really bad classification.
    choice_lengths[0] = strlen(unicharset.id_to_unichar(1));
    return append_choice(NULL, unicharset.id_to_unichar(1), choice_lengths,
                         static_cast<float>(MAX_NUM_INT_FEATURES),
                         static_cast<float>(kReallyBadCertainty), 0);
  }

  // Convert the neighbours only when they were supplied.
  PBLOB *pblob = NULL;
  if (ptblob != NULL) {
    pblob = make_ed_blob(ptblob);
  }
  PBLOB *nblob = NULL;
  if (ntblob != NULL) {
    nblob = make_ed_blob(ntblob);
  }

  BLOB_CHOICE_LIST ratings;  // matcher result
  (*tess_matcher)(pblob, blob, nblob, tess_word, tess_denorm, ratings);

  // The converted blobs are no longer needed; delete accepts null pointers.
  delete blob;
  delete pblob;
  delete nblob;

  // Convert each matcher choice into an entry of the output list.
  LIST result = NULL;
  BLOB_CHOICE_IT it;
  it.set_to_list(&ratings);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOB_CHOICE *choice = it.data();
    choice_lengths[0] = strlen(choice->unichar());
    result = append_choice(result, choice->unichar(), choice_lengths,
                           choice->rating(), choice->certainty(),
                           choice->config());
  }
  return result;
}
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { int num_blobs_to_replace = 0; int begin_blob_index = 0; int i; // Rating and certainty for the new BLOB_CHOICE are derived from the // replaced choices. float new_rating = 0.0f; float new_certainty = 0.0f; BLOB_CHOICE* old_choice = nullptr; for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { if (i >= wrong_ngram_begin_index) { int num_blobs = werd_choice->state(i); int col = begin_blob_index + num_blobs_to_replace; int row = col + num_blobs - 1; BLOB_CHOICE_LIST* choices = ratings->get(col, row); ASSERT_HOST(choices != nullptr); old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); ASSERT_HOST(old_choice != nullptr); new_rating += old_choice->rating(); new_certainty += old_choice->certainty(); num_blobs_to_replace += num_blobs; } else { begin_blob_index += werd_choice->state(i); } } new_certainty /= wrong_ngram_size; // If there is no entry in the ratings matrix, add it. MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); if (!coord.Valid(*ratings)) { ratings->IncreaseBandSize(coord.row - coord.col + 1); } if (ratings->get(coord.col, coord.row) == nullptr) ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row); BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices); if (choice != nullptr) { // Already there. Upgrade if new rating better. if (new_rating < choice->rating()) choice->set_rating(new_rating); if (new_certainty < choice->certainty()) choice->set_certainty(new_certainty); // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. } else { // Need a new choice with the correct_ngram_id. 
choice = new BLOB_CHOICE(*old_choice); choice->set_unichar_id(correct_ngram_id); choice->set_rating(new_rating); choice->set_certainty(new_certainty); choice->set_classifier(BCC_AMBIG); choice->set_matrix_cell(coord.col, coord.row); BLOB_CHOICE_IT it (new_choices); it.add_to_end(choice); } // Remove current unichar from werd_choice. On the last iteration // set the correct replacement unichar instead of removing a unichar. for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { if (replaced_count + 1 == wrong_ngram_size) { werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); } else { werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); } } if (stopper_debug_level >= 1) { werd_choice->print("ReplaceAmbig() "); tprintf("Modified blob_choices: "); print_ratings_list("\n", new_choices, getUnicharset()); } }
/// Recursive helper to find a match to the target_text (from text_index /// position) in the choices (from choices_pos position). /// @param choices is an array of GenericVectors, of length choices_length, /// with each element representing a starting position in the word, and the /// #GenericVector holding classification results for a sequence of consecutive /// blobs, with index 0 being a single blob, index 1 being 2 blobs etc. /// @param choices_pos /// @param choices_length /// @param target_text /// @param text_index /// @param rating /// @param segmentation /// @param best_rating /// @param best_segmentation void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, int choices_pos, int choices_length, const GenericVector<UNICHAR_ID>& target_text, int text_index, float rating, GenericVector<int>* segmentation, float* best_rating, GenericVector<int>* best_segmentation) { const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs(); for (int length = 1; length <= choices[choices_pos].size(); ++length) { // Rating of matching choice or worst choice if no match. float choice_rating = 0.0f; // Find the corresponding best BLOB_CHOICE. BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); choice_rating = choice->rating(); UNICHAR_ID class_id = choice->unichar_id(); if (class_id == target_text[text_index]) { break; } // Search ambigs table. if (class_id < table.size() && table[class_id] != NULL) { AmbigSpec_IT spec_it(table[class_id]); for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) { const AmbigSpec *ambig_spec = spec_it.data(); // We'll only do 1-1. if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID && ambig_spec->correct_ngram_id == target_text[text_index]) break; } if (!spec_it.cycled_list()) break; // Found an ambig. } } if (choice_it.cycled_list()) continue; // No match. 
segmentation->push_back(length); if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) { // This is a complete match. If the rating is good record a new best. if (applybox_debug > 2) { tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n", rating + choice_rating, *best_rating, segmentation->size(), best_segmentation->size()); } if (best_segmentation->empty() || rating + choice_rating < *best_rating) { *best_segmentation = *segmentation; *best_rating = rating + choice_rating; } } else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) { if (applybox_debug > 3) { tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index], unicharset.id_to_unichar(target_text[text_index]), choice_it.data()->unichar_id() == target_text[text_index] ? "Match" : "Ambig", choices_pos, length); } SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1, rating + choice_rating, segmentation, best_rating, best_segmentation); if (applybox_debug > 3) { tprintf("End recursion for %d=%s\n", target_text[text_index], unicharset.id_to_unichar(target_text[text_index])); } } segmentation->truncate(segmentation->size() - 1); } }