void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { BLOCK_IT block_it = blocks; //iterator ROW_IT row_it; //row iterator int num_rows = 0; int num_rows_all = 0; int num_blocks = 0; int num_blocks_all = 0; for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { cleanup_nontext_block(block); continue; } num_rows = 0; num_rows_all = 0; if (clean_noise) { row_it.set_to_list(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); ++num_rows_all; clean_small_noise_from_words(row); if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) || row->word_list()->empty()) { delete row_it.extract(); // lose empty row. } else { if (textord_noise_rejwords) clean_noise_from_words(row_it.data()); if (textord_blshift_maxshift >= 0) tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); ++num_rows; } } } if (block->row_list()->empty()) { delete block_it.extract(); // Lose empty text blocks. } else { ++num_blocks; } ++num_blocks_all; if (textord_noise_debug) tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all); } if (textord_noise_debug) tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all); }
void PrintSegmentationStats(BLOCK_LIST* block_list) { int num_blocks = 0; int num_rows = 0; int num_words = 0; int num_blobs = 0; BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); ++num_blocks; ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ++num_rows; ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.data(); ++num_words; num_blobs += werd->cblob_list()->length(); } } } tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n", num_blocks, num_rows, num_words, num_blobs); }
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: /// All fuzzy spaces are removed, and all the words are maximally chopped. PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, BLOCK_LIST *block_list) { PreenXHeights(block_list); // Strip all fuzzy space markers to simplify the PAGE_RES. BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (word->cblob_list()->empty()) { delete w_it.extract(); } else { word->set_flag(W_FUZZY_SP, false); word->set_flag(W_FUZZY_NON, false); } } } } PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL); PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; while ((word_res = pr_it.word()) != NULL) { MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res); pr_it.forward(); } return page_res; }
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list, C_BLOB_LIST* new_blobs, C_BLOB_LIST* not_found_blobs) { // Now iterate over all the blobs in the segmentation_block_list_, and just // replace the corresponding c-blobs inside the werds. BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->poly_block() != NULL && !block->poly_block()->IsText()) continue; // Don't touch non-text blocks. // Iterate over all rows in the block. ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); WERD_LIST new_words; WERD_IT new_words_it(&new_words); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.extract(); WERD* new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs); if (new_werd) { // Insert this new werd into the actual row's werd-list. Remove the // existing one. new_words_it.add_after_then_move(new_werd); delete werd; } else { // Reinsert the older word back, for lack of better options. // This is critical since dropping the words messes up segmentation: // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on. new_words_it.add_after_then_move(werd); } } // Get rid of the old word list & replace it with the new one. row->word_list()->clear(); werd_it.move_to_first(); werd_it.add_list_after(&new_words); } } }
void apply_box_training(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *bln_word; WERD copy_outword; // copy to denorm PBLOB_IT blob_it; DENORM denorm; INT16 count = 0; char ch[2]; ch[1] = '\0'; tprintf ("Generating training data\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) == 1) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single char label and a single blob so train on it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); tess_training_tester (blob_it.data (), //single blob &denorm, TRUE, //correct ch, //correct ASCII char 1, //ASCII length NULL); copy_outword = *(bln_word); copy_outword.baseline_denormalise (&denorm); blob_it.set_to_list (copy_outword.blob_list ()); ch[0] = *word->text (); delete bln_word; count++; } } } } tprintf ("Generated training data for %d blobs\n", count); }
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks, C_BLOB_LIST* output_blob_list) { C_BLOB_IT return_list_it(output_blob_list); BLOCK_IT block_it(blocks); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.data(); return_list_it.move_to_last(); return_list_it.add_list_after(werd->cblob_list()); return_list_it.move_to_last(); return_list_it.add_list_after(werd->rej_cblob_list()); } } } }
// Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } }
void apply_box_testing(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; INT16 row_count = 0; WERD_IT word_it; WERD *word; WERD *bln_word; INT16 word_count = 0; PBLOB_IT blob_it; DENORM denorm; INT16 count = 0; char ch[2]; WERD *outword; //bln best choice //segmentation WERD_CHOICE *best_choice; //tess output WERD_CHOICE *raw_choice; //top choice permuter //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; INT16 char_count = 0; INT16 correct_count = 0; INT16 err_count = 0; INT16 rej_count = 0; #ifndef SECURE_NAMES WERDSTATS wordstats; //As from newdiff #endif char tess_rej_str[3]; char tess_long_str[3]; ch[1] = '\0'; strcpy (tess_rej_str, "|A"); strcpy (tess_long_str, "|B"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row_count++; word_count = 0; word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_count++; if ((strlen (word->text ()) == 1) && !STRING (applybox_test_exclusions).contains (*word->text ()) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single char label and a single blob so test it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); char_count++; best_choice = tess_segment_pass1 (bln_word, &denorm, tess_default_matcher, raw_choice, &blob_choices, outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((best_choice->string ().length () == 0) || (strspn (best_choice->string ().string (), " ") == best_choice->string ().length ())) { rej_count++; tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", row_count, word_count, ch); #ifndef SECURE_NAMES wordstats.word (tess_rej_str, 2, ch, 1); #endif } else { if ((best_choice->string ().length () != outword->blob_list ()->length ()) || (best_choice->string ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", best_choice->string ().string (), best_choice->string ().length (), outword->blob_list ()->length (), blob_choices.length ()); } ASSERT_HOST (best_choice->string ().length () == outword->blob_list ()->length ()); ASSERT_HOST (best_choice->string ().length () == blob_choices.length ()); fix_quotes ((char *) best_choice->string ().string (), //turn to double outword, &blob_choices); if (strcmp (best_choice->string ().string (), ch) != 0) { err_count++; tprintf ("%d:%d: \"%s\" -> \"%s\"\n", row_count, word_count, ch, best_choice->string ().string ()); } else correct_count++; #ifndef SECURE_NAMES if (best_choice->string ().length () > 2) wordstats.word (tess_long_str, 2, ch, 1); else wordstats.word ((char *) best_choice->string (). string (), best_choice->string ().length (), ch, 1); #endif } delete bln_word; delete outword; delete best_choice; delete raw_choice; blob_choices.deep_clear (); count++; } } } } #ifndef SECURE_NAMES wordstats.print (1, 100.0); wordstats.conf_matrix (); tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", char_count, correct_count, rej_count, err_count); #endif }
/************************************************************************* * tidy_up() * - report >1 block * - sort the words in each row. * - report any rows with no labelled words. * - report any remaining unlabelled words * - report total labelled words * *************************************************************************/ void tidy_up( // BLOCK_LIST *block_list, //real blocks INT16 &ok_char_count, INT16 &ok_row_count, INT16 &unlabelled_words, INT16 *tgt_char_counts, INT16 &rebalance_count, char &min_char, INT16 &min_samples, INT16 &final_labelled_blob_count) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *duplicate_word; INT16 block_idx = 0; INT16 row_idx; INT16 all_row_idx = 0; BOOL8 row_ok; BOOL8 rebalance_needed = FALSE; //No. of unique labelled samples INT16 labelled_char_counts[128]; INT16 i; char ch; char prev_ch = '\0'; BOOL8 at_dupe_of_prev_word; ROW *prev_row = NULL; INT16 left; INT16 prev_left = -1; for (i = 0; i < 128; i++) labelled_char_counts[i] = 0; ok_char_count = 0; ok_row_count = 0; unlabelled_words = 0; if ((applybox_debug > 4) && (block_it.length () != 1)) tprintf ("APPLY_BOXES: More than one block??\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_idx++; row_idx = 0; row_ok = FALSE; row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_idx++; all_row_idx++; row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if (strlen (word->text ()) == 0) { unlabelled_words++; if (applybox_debug > 4) { tprintf ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); } } else { if (word->gblob_list ()->length () != 1) tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); ok_char_count++; labelled_char_counts[*word->text ()]++; row_ok = TRUE; } } if ((applybox_debug > 4) && (!row_ok)) { tprintf ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); } else ok_row_count++; } } min_samples = 9999; for (i = 0; i < 128; i++) { if (tgt_char_counts[i] > labelled_char_counts[i]) { if (labelled_char_counts[i] <= 1) { tprintf ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n", labelled_char_counts[i], (char) i, tgt_char_counts[i]); } else { rebalance_needed = TRUE; if (applybox_debug > 0) tprintf ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n", (char) i, tgt_char_counts[i], labelled_char_counts[i]); } } if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { min_samples = labelled_char_counts[i]; min_char = (char) i; } } while (applybox_rebalance && rebalance_needed) { block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); left = word->bounding_box ().left (); ch = *word->text (); at_dupe_of_prev_word = ((row == prev_row) && (left = prev_left) && (ch == prev_ch)); if ((ch != '\0') && (labelled_char_counts[ch] > 1) && (tgt_char_counts[ch] > labelled_char_counts[ch]) && (!at_dupe_of_prev_word)) { /* Duplicate the word to rebalance the labelled samples */ if (applybox_debug > 9) { tprintf ("Duping \"%c\" from ", ch); word->bounding_box ().print (); } duplicate_word = new WERD; *duplicate_word = *word; word_it.add_after_then_move (duplicate_word); rebalance_count++; labelled_char_counts[ch]++; } prev_row = row; prev_left = left; prev_ch = ch; } } } rebalance_needed = FALSE; for (i = 0; i < 128; i++) { if ((tgt_char_counts[i] > labelled_char_counts[i]) && (labelled_char_counts[i] > 1)) { rebalance_needed = TRUE; break; } } } /* Now final check - count labelled blobs */ final_labelled_blob_count = 0; block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) == 1) && (word->gblob_list ()->length () == 1)) final_labelled_blob_count++; } } } }
ROW *find_row_of_box( // BLOCK_LIST *block_list, //real blocks BOX box, //from boxfile INT16 &block_id, INT16 &row_id_to_process) { BLOCK_IT block_it(block_list); BLOCK *block; ROW_IT row_it; ROW *row; ROW *row_to_process = NULL; INT16 row_id; WERD_IT word_it; WERD *word; BOOL8 polyg; PBLOB_IT blob_it; PBLOB *blob; OUTLINE_IT outline_it; OUTLINE *outline; /* Find row to process - error if box REALLY overlaps more than one row. (I.e it overlaps blobs in the row - not just overlaps the bounding box of the whole row.) */ block_id = 0; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_id++; row_id = 0; block = block_it.data (); if (block->bounding_box ().overlap (box)) { row_it.set_to_list (block->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_id++; row = row_it.data (); if (row->bounding_box ().overlap (box)) { word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (gblob_bounding_box (blob, polyg). overlap (box)) { outline_it. set_to_list (gblob_out_list (blob, polyg)); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline = outline_it.data (); if (goutline_bounding_box (outline, polyg).major_overlap (box)) { if ((row_to_process == NULL) || (row_to_process == row)) { row_to_process = row; row_id_to_process = row_id; } else /* RETURN ERROR Box overlaps blobs in more than one row */ return NULL; } } } } } } } } } } return row_to_process; }
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls // TransferDiacriticsToWords to copy the diacritic blobs to the most // appropriate words in the group of blocks. Source blobs are not touched. void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks) { // Angle difference larger than this is too much to consider equal. // They should only be in multiples of M_PI/2 anyway. const double kMaxAngleDiff = 0.01; // About 0.6 degrees. PointerVector<BlockGroup> groups; BLOCK_IT bk_it(blocks); for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { BLOCK* block = bk_it.data(); if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { continue; } // Linear search of the groups to find a matching rotation. float block_angle = block->re_rotation().angle(); int best_g = 0; float best_angle_diff = MAX_FLOAT32; for (int g = 0; g < groups.size(); ++g) { double angle_diff = fabs(block_angle - groups[g]->angle); if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); if (angle_diff < best_angle_diff) { best_angle_diff = angle_diff; best_g = g; } } if (best_angle_diff > kMaxAngleDiff) { groups.push_back(new BlockGroup(block)); } else { groups[best_g]->blocks.push_back(block); groups[best_g]->bounding_box += block->pdblk.bounding_box(); float x_height = block->x_height(); if (x_height < groups[best_g]->min_xheight) groups[best_g]->min_xheight = x_height; } } // Now process each group of blocks. PointerVector<WordWithBox> word_ptrs; for (int g = 0; g < groups.size(); ++g) { const BlockGroup* group = groups[g]; if (group->bounding_box.null_box()) continue; WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), group->bounding_box.topright()); for (int b = 0; b < group->blocks.size(); ++b) { ROW_IT row_it(group->blocks[b]->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Put the words of the row into the grid. WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); WordWithBox* box_word = new WordWithBox(word); word_grid.InsertBBox(true, true, box_word); // Save the pointer where it will be auto-deleted. word_ptrs.push_back(box_word); } } } FCOORD rotation = group->rotation; // Make it a forward rotation that will transform blob coords to block. rotation.set_y(-rotation.y()); TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); } }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }