/// Gather every outline in the row that majorly overlaps the given box into
/// a single new blob inside a single new word labelled with ch.
///
/// Words, blobs and outlines are visited in list order. Only words and blobs
/// whose bounding boxes overlap the box are searched, and only outlines whose
/// bounding box has a major overlap with the box are taken. Blobs left with
/// no outlines, and words left with no blobs, are deleted from their lists.
///
/// If a matching outline belongs to a word that already has a non-empty
/// label, that word is unlabelled and counted as an error. Once an error has
/// been seen no further outlines are moved.
///
/// When the new word is built successfully, the top and bottom of its
/// bounding box are checked against the row baseline and x-height for the
/// character classes chs_caps_ht, chs_odd_top, chs_x_ht, chs_non_ambig_bl,
/// chs_odd_bot and chs_desc, using applybox_error_band as the tolerance
/// (as a fraction of the x-height). A word that fails a check stays in the
/// row but has its label cleared.
///
/// @param row             row whose word list is searched and modified
/// @param box             target character box
/// @param ch              label to give the new word; ch[0] selects the checks
/// @param block_id        block number, used only in debug output
/// @param row_id          row number, used only in debug output
/// @param boxfile_lineno  box file line number, passed to report_failed_box
/// @param boxfile_charno  box file char number, passed to report_failed_box
/// @return 0 on success; 1 if no outline was found or a position check
///         failed; otherwise the error count accumulated from labelled-word
///         conflicts (set to 1 on the first conflict and then incremented
///         once for every labelled word that gets unlabelled).
INT16 resegment_box(ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD *new_word = NULL;
  // NOTE(review): after the loop polyg holds the flag of the LAST word
  // visited, and that value is what gblob_sort_list() receives below.
  // Presumably all words in a row share the flag - confirm.
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  INT16 error_count = 0;  // number of chars lost

  WERD_IT word_it;
  word_it.set_to_list(row->word_list());
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD *word = word_it.data();
    polyg = word->flag(W_POLYGON);
    if (!word->bounding_box().overlap(box))
      continue;

    blob_it.set_to_list(word->gblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
         blob_it.forward()) {
      PBLOB *blob = blob_it.data();
      if (!gblob_bounding_box(blob, polyg).overlap(box))
        continue;

      outline_it.set_to_list(gblob_out_list(blob, polyg));
      for (outline_it.mark_cycle_pt(); !outline_it.cycled_list();
           outline_it.forward()) {
        OUTLINE *outline = outline_it.data();
        if (!goutline_bounding_box(outline, polyg).major_overlap(box))
          continue;

        if (strlen(word->text()) > 0) {
          // The box claims an outline of a word that is already labelled.
          if (error_count == 0) {
            error_count = 1;
            if (applybox_debug > 4)
              report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                                "FAILURE! box overlaps blob in labelled word");
          }
          if (applybox_debug > 4)
            tprintf("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
                    block_id, row_id, word->text());
          word->set_text("");  // UN label it
          error_count++;
        }

        if (error_count == 0) {
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(ch);
            PBLOB *new_blob;
            if (polyg)
              new_blob = new PBLOB;
            else
              new_blob = (PBLOB *) new C_BLOB;
            PBLOB_IT new_blob_it;
            new_blob_it.set_to_list(new_word->gblob_list());
            new_blob_it.add_to_end(new_blob);
            new_outline_it.set_to_list(gblob_out_list(new_blob, polyg));
          }
          // Move the outline out of its source blob into the new blob.
          new_outline_it.add_to_end(outline_it.extract());
        }
      }
      // No outlines left in the blob, so delete the blob.
      if (outline_it.empty())
        delete blob_it.extract();
    }
    // No blobs left in the word, so delete the word.
    if (blob_it.empty())
      delete word_it.extract();
  }

  if (error_count > 0) {
    // A word may have been started before the first conflict was seen. It
    // has not been linked into the row, so nothing else can reach it: free
    // it here instead of leaking it. Deleting a NULL pointer is a no-op.
    // NOTE(review): assumes the WERD destructor releases the blobs the word
    // holds, as for the words deleted via word_it.extract() above - confirm.
    delete new_word;
    return error_count;
  }

  if (new_word == NULL) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! Couldn't find any blobs");
    return 1;
  }

  gblob_sort_list(new_word->gblob_list(), polyg);
  word_it.add_to_end(new_word);
  BOX new_word_box = new_word->bounding_box();
  float word_x_centre = (new_word_box.left() + new_word_box.right()) / 2.0f;
  float baseline = row->base_line(word_x_centre);

  // Vertical limits around the x-height line and the baseline, each widened
  // or narrowed by applybox_error_band times the x-height.
  const double above_xht =
      baseline + (1 + applybox_error_band) * row->x_height();
  const double below_xht =
      baseline + (1 - applybox_error_band) * row->x_height();
  const double above_bl = baseline + applybox_error_band * row->x_height();
  const double below_bl = baseline - applybox_error_band * row->x_height();

  if (STRING(chs_caps_ht).contains(ch[0]) &&
      (new_word_box.top() < above_xht)) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! caps-ht char didn't ascend");
    new_word->set_text("");
    return 1;
  }
  if (STRING(chs_odd_top).contains(ch[0]) &&
      (new_word_box.top() < below_xht)) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! Odd top char below xht");
    new_word->set_text("");
    return 1;
  }
  if (STRING(chs_x_ht).contains(ch[0]) &&
      ((new_word_box.top() > above_xht) ||
       (new_word_box.top() < below_xht))) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! x-ht char didn't have top near xht");
    new_word->set_text("");
    return 1;
  }
  if (STRING(chs_non_ambig_bl).contains(ch[0]) &&
      ((new_word_box.bottom() < below_bl) ||
       (new_word_box.bottom() > above_bl))) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! non ambig BL char didnt have bottom near baseline");
    new_word->set_text("");
    return 1;
  }
  if (STRING(chs_odd_bot).contains(ch[0]) &&
      (new_word_box.bottom() > above_bl)) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! Odd bottom char above baseline");
    new_word->set_text("");
    return 1;
  }
  if (STRING(chs_desc).contains(ch[0]) &&
      (new_word_box.bottom() > below_bl)) {
    report_failed_box(boxfile_lineno, boxfile_charno, box, ch,
                      "FAILURE! Descender doesn't descend");
    new_word->set_text("");
    return 1;
  }
  return 0;
}
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. 
if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }