/**********************************************************************
 * split_and_recog_word
 *
 * Cut the word in two at its widest inter-blob gap, run recognition on
 * each half independently, then merge the two results back into `word`.
 **********************************************************************/
void Tesseract::split_and_recog_word(WERD_RES *word) {
  // Walk every adjacent pair of chopped blobs and remember the index of the
  // blob that sits immediately to the right of the widest horizontal gap.
  const int blob_count = word->chopped_word->NumBlobs();
  int widest_gap = -MAX_INT32;
  int split_index = 0;
  for (int right = 1; right < blob_count; ++right) {
    TBOX left_box = word->chopped_word->blobs[right - 1]->bounding_box();
    TBOX right_box = word->chopped_word->blobs[right]->bounding_box();
    const int gap = right_box.left() - left_box.right();
    if (gap > widest_gap) {
      widest_gap = gap;
      split_index = right;
    }
  }
  // With fewer than two blobs the loop body never runs and split_index
  // remains 0, which trips this assertion.
  ASSERT_HOST(split_index > 0);

  // Break the word at the chosen blob; the tail ends up in second_half.
  WERD_RES *second_half = NULL;
  BlamerBundle *saved_blamer = NULL;
  split_word(word, split_index, &second_half, &saved_blamer);

  // Recognize the head (still held in `word`), then the tail.
  recog_word_recursive(word);
  recog_word_recursive(second_half);

  // Stitch the two recognized halves back together into `word`.
  join_words(word, second_half, saved_blamer);
}
/**
 * Recognize a single word and validate the result.
 *
 * Steps, in order:
 *  - optionally skip words that carry no ground truth (marks tess_failed);
 *  - run the recursive recognizer and build the box word;
 *  - assert that best_choice and box_word agree in length and that the
 *    segmentation states are valid;
 *  - optionally override the permuter with the one reported by a direct
 *    dictionary check;
 *  - flag the word as failed when the result is missing, empty or all spaces.
 *
 * @param word the word to recognize; updated in place.
 */
void Tesseract::recog_word(WERD_RES *word) {
  if (wordrec_skip_no_truth_words) {
    const bool no_truth =
        word->blamer_bundle == NULL ||
        word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH;
    if (no_truth) {
      if (classify_debug_level) {
        tprintf("No truth for word - skipping\n");
      }
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // Report the mismatch details before the assertion below fires.
  const bool length_mismatch =
      word->best_choice->length() != word->box_word->length();
  if (length_mismatch) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
            "Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(),
            word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The size of the ratings matrix has to match the sum of all the
  // segmentation states.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // If the recognizer did not credit a dictionary permuter but a plain
    // dictionary lookup does, adopt the dictionary's permuter type.
    const uinT8 initial_perm = word->best_choice->permuter();
    const bool already_dawg = initial_perm == SYSTEM_DAWG_PERM ||
                              initial_perm == FREQ_DAWG_PERM ||
                              initial_perm == USER_DAWG_PERM;
    if (!already_dawg) {
      const uinT8 dict_perm = dict_word(*word->best_choice);
      const bool dict_is_dawg = dict_perm == SYSTEM_DAWG_PERM ||
                                dict_perm == FREQ_DAWG_PERM ||
                                dict_perm == USER_DAWG_PERM;
      // Only switch when the string contains at least one alpha character.
      if (dict_is_dawg &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              initial_perm, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));

  // A missing, empty or all-space result counts as a recognition failure.
  bool recognition_failed =
      word->best_choice == NULL || word->best_choice->length() == 0;
  if (!recognition_failed) {
    const int leading_spaces = static_cast<int>(
        strspn(word->best_choice->unichar_string().string(), " "));
    recognition_failed = leading_spaces == word->best_choice->length();
  }

  if (recognition_failed) {
    word->tess_failed = true;
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  } else {
    word->tess_failed = false;
  }
}
/**********************************************************************
 * recog_word
 *
 * Hand the word to the recursive tess segmenter (or fabricate empty
 * results when the word has no blobs), check that the outputs agree in
 * length, copy across the reject blobs, and optionally override the
 * permuter type from a direct dictionary lookup.
 *
 * word          word to recognize
 * denorm        de-normaliser
 * matcher       matcher function
 * tester        tester function
 * trainer       trainer function
 * testing       true if answer driven
 * raw_choice    (out) raw result
 * blob_choices  list of blob choice lists
 * outword       (out) bln word output
 * returns       the chosen word result
 **********************************************************************/
WERD_CHOICE *recog_word(WERD *word,
                        DENORM *denorm,
                        POLY_MATCHER matcher,
                        POLY_TESTER tester,
                        POLY_TESTER trainer,
                        BOOL8 testing,
                        WERD_CHOICE *&raw_choice,
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword) {
  WERD_CHOICE *word_choice;

  if (!word->blob_list()->empty()) {
    word_choice = recog_word_recursive(word, denorm, matcher, tester, trainer,
                                       testing, raw_choice, blob_choices,
                                       outword);
  } else {
    // Nothing to recognize: build empty choices and a plain copy of the word.
    char no_lengths[] = {0};
    word_choice = new WERD_CHOICE("", no_lengths, 10.0f, -1.0f,
                                  TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE("", no_lengths, 10.0f, -1.0f,
                                 TOP_CHOICE_PERM);
    outword = word->poly_copy(denorm->row()->x_height());
  }

  // Print the details of any length disagreement before the assertions
  // below abort.
  const bool blobs_mismatch =
      word_choice->lengths().length() != outword->blob_list()->length();
  const bool choices_mismatch =
      word_choice->lengths().length() != blob_choices->length();
  if (blobs_mismatch || choices_mismatch) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
            word_choice->string().string(),
            word_choice->lengths().length(),
            outword->blob_list()->length(),
            blob_choices->length());
  }
  ASSERT_HOST (word_choice->lengths ().length () == outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  // Carry any reject blobs over into the output word.
  outword->rej_blob_list()->deep_copy(word->rej_blob_list());

  if (tessedit_override_permuter) {
    // If the recognizer did not credit a dictionary permuter but a plain
    // dictionary lookup does, adopt the dictionary's permuter type.
    const uinT8 initial_perm = word_choice->permuter();
    const bool already_dawg = initial_perm == SYSTEM_DAWG_PERM ||
                              initial_perm == FREQ_DAWG_PERM ||
                              initial_perm == USER_DAWG_PERM;
    if (!already_dawg) {
      const uinT8 dict_perm = dict_word(word_choice->string().string());
      const bool dict_is_dawg = dict_perm == SYSTEM_DAWG_PERM ||
                                dict_perm == FREQ_DAWG_PERM ||
                                dict_perm == USER_DAWG_PERM;
      // Only switch when the string contains at least one alpha character.
      if (dict_is_dawg &&
          alpha_count(word_choice->string().string(),
                      word_choice->lengths().string()) > 0) {
        word_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              initial_perm, word_choice->permuter());
    }
  }

  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
/**
 * Split a word at its widest inter-blob gap, recognize both pieces and
 * combine the outcomes.
 *
 * A poly copy of the word is scanned for the largest horizontal gap between
 * consecutive blobs. The blobs from that gap onwards are moved into a second
 * word. Both words go through recog_word_recursive(), after which the second
 * result, raw choice and output word are appended to the first.
 *
 * @param word          word to recognize
 * @param denorm        de-normaliser
 * @param matcher       matcher function
 * @param tester        tester function
 * @param trainer       trainer function
 * @param testing       true if answer driven
 * @param raw_choice    (out) combined raw result
 * @param blob_choices  list of blob choice lists, shared by both halves
 * @param outword       (out) combined bln word output
 * @return the combined word choice
 */
WERD_CHOICE *split_and_recog_word(WERD *word,
                                  DENORM *denorm,
                                  POLY_MATCHER matcher,
                                  POLY_TESTER tester,
                                  POLY_TESTER trainer,
                                  BOOL8 testing,
                                  WERD_CHOICE *&raw_choice,
                                  BLOB_CHOICE_LIST_CLIST *blob_choices,
                                  WERD *&outword) {
  PBLOB_LIST new_blobs;              // receives the blobs of the second half
  PBLOB_IT blob_it;                  // walks the copied word's blobs
  PBLOB_IT split_it(&new_blobs);     // start of the second half, once found

  WERD *first_word = word->poly_copy(denorm->row()->x_height());
  blob_it.set_to_list(first_word->blob_list());

  // Measure the gap between each blob and its successor. The iterator is
  // advanced before the comparison so that a saved position always refers
  // to the blob on the right-hand side of the gap.
  float widest_gap = -MAX_INT32;
  while (!blob_it.at_last()) {
    PBLOB *current = blob_it.data();
    const float gap = blob_it.data_relative(1)->bounding_box().left() -
                      current->bounding_box().right();
    blob_it.forward();
    if (gap > widest_gap) {
      widest_gap = gap;
      split_it = blob_it;
    }
  }

  // Move everything from the split position to the last blob into new_blobs
  // and wrap that list up as a word of its own.
  new_blobs.assign_to_sublist(&split_it, &blob_it);
  WERD *second_word = new WERD(&new_blobs, 1, NULL);
  ASSERT_HOST (word->blob_list ()->length () == first_word->blob_list ()->length () + second_word->blob_list ()->length ());

  // First half: its results land directly in the caller's output slots.
  WERD_CHOICE *result = recog_word_recursive(first_word, denorm, matcher,
                                             tester, trainer, testing,
                                             raw_choice, blob_choices,
                                             outword);
  delete first_word;

  // Second half: its results land in temporaries that are merged below.
  WERD_CHOICE *raw_choice2;
  WERD *outword2;
  WERD_CHOICE *result2 = recog_word_recursive(second_word, denorm, matcher,
                                              tester, trainer, testing,
                                              raw_choice2, blob_choices,
                                              outword2);
  delete second_word;

  // Append the second half's ratings, raw choice and blobs to the first.
  *result += *result2;
  delete result2;
  *raw_choice += *raw_choice2;
  delete raw_choice2;
  outword->join_on(outword2);
  delete outword2;

  // A sanity check that the joined word's blob count equals the sum of the
  // two parts' counts used to live here; it is disabled.
  return result;
}
/**
 * Try splitting off the given number of (chopped) blobs from the front and
 * back of the given word and recognizing the pieces.
 *
 * The input word itself is not handed to the recognizer: a copy is made with
 * `new WERD_RES(*word)` and that copy is what gets split, recognized and
 * rejoined.
 *
 * @param[in] num_chopped_leading how many chopped blobs from the left
 *            end of the word to chop off and try recognizing as a
 *            superscript (or subscript)
 * @param[in] leading_certainty the (minimum) certainty had by the
 *            characters in the original leading section.
 * @param[in] leading_pos "super" or "sub" (for debugging)
 * @param[in] num_chopped_trailing how many chopped blobs from the right
 *            end of the word to chop off and try recognizing as a
 *            superscript (or subscript)
 * @param[in] trailing_certainty the (minimum) certainty had by the
 *            characters in the original trailing section.
 * @param[in] trailing_pos "super" or "sub" (for debugging)
 * @param[in] word the word to try to chop up.
 * @param[out] is_good do we believe our result?
 * @param[out] retry_rebuild_leading, retry_rebuild_trailing
 *         If non-zero, and !is_good, then the caller may have luck trying
 *         to split the returned word with this number of (rebuilt) leading
 *         and trailing blobs / unichars.
 * @return A newly allocated word which is the result of re-recognizing as
 *         asked, or NULL when the result is not believable and no retry
 *         hint could be produced. All intermediate pieces are released
 *         before NULL is returned.
 */
WERD_RES *Tesseract::TrySuperscriptSplits(
    int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
    int num_chopped_trailing, float trailing_certainty,
    ScriptPos trailing_pos,
    WERD_RES *word,
    bool *is_good,
    int *retry_rebuild_leading, int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.
  // bb0 / bb1 receive the blamer bundles that split_word() hands back for
  // the prefix|core and core|suffix splits respectively.
  BlamerBundle *bb0 = NULL;
  BlamerBundle *bb1 = NULL;
  WERD_RES *prefix = NULL;
  WERD_RES *core = NULL;
  WERD_RES *suffix = NULL;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }

  if (num_chopped_trailing > 0) {
    // core no longer contains the leading blobs, so the split point is
    // measured from the start of core.
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }

  // Recognize the pieces in turn.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had. A piece that was never split off counts
  // as good.
  bool good_prefix = !prefix || BelievableSuperscript(
      superscript_debug >= 1, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, NULL);
  bool good_suffix = !suffix || BelievableSuperscript(
      superscript_debug >= 1, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      NULL, retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    delete core;
    delete prefix;
    delete suffix;
    // The bundles from split_word() are only consumed by the join_words()
    // calls below, which this path never reaches; release them here so they
    // are not leaked. Deleting a NULL pointer is harmless.
    // NOTE(review): assumes split_word() passes ownership of the bundle to
    // the caller until join_words() takes it - confirm against split_word().
    delete bb0;
    delete bb1;
    return NULL;
  }
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    // The joined result lives in prefix; return it under the name core.
    core = prefix;
    prefix = NULL;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().string());
  }
  return core;
}