void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); TO_ROW* row = to_row_it.data(); // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready // to create the word. C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(row->blob_list()); for (;!box_it.empty(); box_it.forward()) { BLOBNBOX* bblob= box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL) { C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); delete bblob; } } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size), static_cast<inT16>(row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word_it.add_after_then_move(word); ROW_IT row_it(real_rows); row_it.add_after_then_move(real_row); }
WERD *add_repeated_word( //move repeated word WERD_IT *rep_it, //repeated words int16_t &rep_left, //left edge of word int16_t &prev_chop_coord, //previous word end uint8_t &blanks, //no of blanks float pitch, //char cell size WERD_IT *word_it //list of words ) { WERD *word; //word to move int16_t new_blanks; //extra blanks if (rep_left > prev_chop_coord) { new_blanks = (uint8_t) floor ((rep_left - prev_chop_coord) / pitch + 0.5); blanks += new_blanks; } word = rep_it->extract (); prev_chop_coord = word->bounding_box ().right (); word_it->add_after_then_move (word); word->set_blanks (blanks); rep_it->forward (); if (rep_it->empty ()) rep_left = INT16_MAX; else rep_left = rep_it->data ()->bounding_box ().left (); blanks = 0; return word; }
void PrintSegmentationStats(BLOCK_LIST* block_list) { int num_blocks = 0; int num_rows = 0; int num_words = 0; int num_blobs = 0; BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); ++num_blocks; ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ++num_rows; ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.data(); ++num_words; num_blobs += werd->cblob_list()->length(); } } } tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n", num_blocks, num_rows, num_words, num_blobs); }
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) { PAGE_RES_IT pr_it(page_res); C_BLOB_LIST new_blobs; // list of gathered blobs C_BLOB_IT new_blob_it = &new_blobs; // iterator for (WERD_RES* word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { WERD* word = word_res->word; if (word->bounding_box().overlap(selection_box)) { C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); if (blob->bounding_box().overlap(selection_box)) { new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob)); } } if (!new_blobs.empty()) { WERD* pseudo_word = new WERD(&new_blobs, 1, NULL); word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word); PAGE_RES_IT* it = new PAGE_RES_IT(page_res); while (it->word() != word_res && it->word() != NULL) it->forward(); ASSERT_HOST(it->word() == word_res); return it; } } } return NULL; }
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: /// All fuzzy spaces are removed, and all the words are maximally chopped. PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, BLOCK_LIST *block_list) { PreenXHeights(block_list); // Strip all fuzzy space markers to simplify the PAGE_RES. BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (word->cblob_list()->empty()) { delete w_it.extract(); } else { word->set_flag(W_FUZZY_SP, false); word->set_flag(W_FUZZY_NON, false); } } } } PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL); PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; while ((word_res = pr_it.word()) != NULL) { MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res); pr_it.forward(); } return page_res; }
// Remove outlines that are a tiny fraction in either width or height // of the word height. void Textord::clean_small_noise_from_words(ROW *row) { WERD_IT word_it(row->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { WERD* word = word_it.data(); int min_size = static_cast<int>( textord_noise_hfract * word->bounding_box().height() + 0.5); C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT out_it(blob->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { C_OUTLINE* outline = out_it.data(); outline->RemoveSmallRecursive(min_size, &out_it); } if (blob->out_list()->empty()) { delete blob_it.extract(); } } if (word->cblob_list()->empty()) { if (!word_it.at_last()) { // The next word is no longer a fuzzy non space if it was before, // since the word before is about to be deleted. WERD* next_word = word_it.data_relative(1); if (next_word->flag(W_FUZZY_NON)) { next_word->set_flag(W_FUZZY_NON, false); } } delete word_it.extract(); } } }
//yangjing01 modified : bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); ROW_IT row_it(real_rows); //to_real_row is the real row information of single row or single char mode TO_ROW* real_to_row = NULL; float row_max_height = 0.0; for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()){ TO_ROW* row = to_row_it.data(); float row_min_y = row->min_y(); float row_max_y = row->max_y(); float row_height = abs(row_max_y - row_min_y); if (real_to_row == NULL || row_height > row_max_height || fabs(row_height - row_max_height) < 1.0f){ row_max_height = row_height; real_to_row = row; } } if (real_to_row == NULL){ return false; } C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(real_to_row->blob_list()); for (; !box_it.empty(); box_it.forward()){ BLOBNBOX* bblob = box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL){ C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size), static_cast<inT16>(real_to_row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); row_it.add_after_then_move(real_row); return true; }
WERD *make_real_word(BLOBNBOX_IT *box_it, //iterator inT32 blobcount, //no of blobs to use BOOL8 bol, //start of line uinT8 blanks //no of blanks ) { OUTLINE_IT out_it; // outlines C_OUTLINE_IT cout_it; PBLOB_LIST blobs; // blobs in word C_BLOB_LIST cblobs; PBLOB_IT blob_it = &blobs; // iterator C_BLOB_IT cblob_it = &cblobs; WERD *word; // new word BLOBNBOX *bblob; // current blob inT32 blobindex; // in row for (blobindex = 0; blobindex < blobcount; blobindex++) { bblob = box_it->extract(); if (bblob->joined_to_prev()) { if (bblob->blob() != NULL) { out_it.set_to_list(blob_it.data()->out_list()); out_it.move_to_last(); out_it.add_list_after(bblob->blob()->out_list()); delete bblob->blob(); } else if (bblob->cblob() != NULL) { cout_it.set_to_list(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->blob() != NULL) blob_it.add_after_then_move(bblob->blob()); else if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; box_it->forward(); // next one } if (blanks < 1) blanks = 1; if (blob_it.empty()) word = new WERD(&cblobs, blanks, NULL); else word = new WERD(&blobs, blanks, NULL); if (bol) word->set_flag(W_BOL, TRUE); if (box_it->at_first()) word->set_flag(W_EOL, TRUE); // at end of line return word; }
void ROW::plot( //draw it ScrollView* window //window to draw in ) { WERD *word; //current word WERD_IT it = &words; //words of ROW for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { word = it.data (); word->plot (window); //in rainbow colours } }
void ROW::plot( //draw it ScrollView* window, //window to draw in ScrollView::Color colour //colour to draw in ) { WERD *word; //current word WERD_IT it = &words; //words of ROW for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { word = it.data (); word->plot (window, colour); //all in one colour } }
/** * @name process_selected_words() * * Walk the current block list applying the specified word processor function * to each word that overlaps the selection_box. */ void Tesseract::process_selected_words( PAGE_RES* page_res, // blocks to check TBOX & selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) { for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL; page_res_it.forward()) { WERD* word = page_res_it.word()->word; if (word->bounding_box().overlap(selection_box)) { if (!(this->*word_processor)(&page_res_it)) return; } } }
void apply_box_training(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *bln_word; WERD copy_outword; // copy to denorm PBLOB_IT blob_it; DENORM denorm; INT16 count = 0; char ch[2]; ch[1] = '\0'; tprintf ("Generating training data\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) == 1) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single char label and a single blob so train on it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); tess_training_tester (blob_it.data (), //single blob &denorm, TRUE, //correct ch, //correct ASCII char 1, //ASCII length NULL); copy_outword = *(bln_word); copy_outword.baseline_denormalise (&denorm); blob_it.set_to_list (copy_outword.blob_list ()); ch[0] = *word->text (); delete bln_word; count++; } } } } tprintf ("Generated training data for %d blobs\n", count); }
WERD *make_real_word(BLOBNBOX_IT *box_it, //iterator int32_t blobcount, //no of blobs to use bool bol, //start of line uint8_t blanks //no of blanks ) { C_OUTLINE_IT cout_it; C_BLOB_LIST cblobs; C_BLOB_IT cblob_it = &cblobs; WERD *word; // new word BLOBNBOX *bblob; // current blob int32_t blobindex; // in row for (blobindex = 0; blobindex < blobcount; blobindex++) { bblob = box_it->extract(); if (bblob->joined_to_prev()) { if (bblob->cblob() != nullptr) { cout_it.set_to_list(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != nullptr) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; box_it->forward(); // next one } if (blanks < 1) blanks = 1; word = new WERD(&cblobs, blanks, nullptr); if (bol) word->set_flag(W_BOL, true); if (box_it->at_first()) word->set_flag(W_EOL, true); // at end of line return word; }
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list, C_BLOB_LIST* new_blobs, C_BLOB_LIST* not_found_blobs) { // Now iterate over all the blobs in the segmentation_block_list_, and just // replace the corresponding c-blobs inside the werds. BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->poly_block() != NULL && !block->poly_block()->IsText()) continue; // Don't touch non-text blocks. // Iterate over all rows in the block. ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); WERD_LIST new_words; WERD_IT new_words_it(&new_words); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.extract(); WERD* new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs); if (new_werd) { // Insert this new werd into the actual row's werd-list. Remove the // existing one. new_words_it.add_after_then_move(new_werd); delete werd; } else { // Reinsert the older word back, for lack of better options. // This is critical since dropping the words messes up segmentation: // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on. new_words_it.add_after_then_move(werd); } } // Get rid of the old word list & replace it with the new one. row->word_list()->clear(); werd_it.move_to_first(); werd_it.add_list_after(&new_words); } } }
/** * Returns the baseline of the current object at the given level. * The baseline is the line that passes through (x1, y1) and (x2, y2). * WARNING: with vertical text, baselines may be vertical! */ bool PageIterator::Baseline(PageIteratorLevel level, int* x1, int* y1, int* x2, int* y2) const { if (it_->word() == NULL) return false; // Already at the end! ROW* row = it_->row()->row; WERD* word = it_->word()->word; TBOX box = (level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box() : row->bounding_box(); int left = box.left(); ICOORD startpt(left, static_cast<inT16>(row->base_line(left) + 0.5)); int right = box.right(); ICOORD endpt(right, static_cast<inT16>(row->base_line(right) + 0.5)); // Rotate to image coordinates and convert to global image coords. startpt.rotate(it_->block()->block->re_rotation()); endpt.rotate(it_->block()->block->re_rotation()); *x1 = startpt.x() / scale_ + rect_left_; *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_; *x2 = endpt.x() / scale_ + rect_left_; *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_; return true; }
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks, C_BLOB_LIST* output_blob_list) { C_BLOB_IT return_list_it(output_blob_list); BLOCK_IT block_it(blocks); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.data(); return_list_it.move_to_last(); return_list_it.add_list_after(werd->cblob_list()); return_list_it.move_to_last(); return_list_it.add_list_after(werd->rej_cblob_list()); } } } }
// Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } }
/** * word_set_display() Word processor * * Display word according to current display mode settings */ BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) { WERD* word = word_res->word; word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX)); word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT)); word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL)); word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP)); word->set_display_flag(DF_BN_POLYGONAL, word_display_mode.bit(DF_BN_POLYGONAL)); word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER)); return word_display(block, row, word_res); }
/** * word_set_display() Word processor * * Display word according to current display mode settings */ BOOL8 Tesseract::word_set_display(PAGE_RES_IT* pr_it) { WERD* word = pr_it->word()->word; word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX)); word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT)); word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL)); word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP)); word->set_display_flag(DF_BN_POLYGONAL, word_display_mode.bit(DF_BN_POLYGONAL)); word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER)); return word_display(pr_it); }
void ROW::recalc_bounding_box() { //recalculate BB WERD *word; //current word WERD_IT it = &words; //words of ROW inT16 left; //of word inT16 prev_left; //old left if (!it.empty ()) { word = it.data (); prev_left = word->bounding_box ().left (); it.forward (); while (!it.at_first ()) { word = it.data (); left = word->bounding_box ().left (); if (left < prev_left) { it.move_to_first (); //words in BB order it.sort (word_comparator); break; } prev_left = left; it.forward (); } } for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { word = it.data (); if (it.at_first ()) word->set_flag (W_BOL, TRUE); else //not start of line word->set_flag (W_BOL, FALSE); if (it.at_last ()) word->set_flag (W_EOL, TRUE); else //not end of line word->set_flag (W_EOL, FALSE); //extend BB as reqd bound_box += word->bounding_box (); } }
/// Resegments the words by running the classifier in an attempt to find the /// correct segmentation that produces the required string. void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) { PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) { WERD* word = word_res->word; if (word->text() == NULL || word->text()[0] == '\0') continue; // Ignore words that have no text. // Convert the correct text to a vector of UNICHAR_ID GenericVector<UNICHAR_ID> target_text; if (!ConvertStringToUnichars(word->text(), &target_text)) { tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->text()); pr_it.DeleteCurrentWord(); continue; } if (!FindSegmentation(target_text, word_res)) { tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->text()); pr_it.DeleteCurrentWord(); continue; } } }
/********************************************************************** * fixed_pitch_words * * Make a ROW from a fixed pitch TO_ROW. **********************************************************************/ ROW *fixed_pitch_words( //find lines TO_ROW *row, //row to do FCOORD rotation //for drawing ) { bool bol; //start of line uint8_t blanks; //in front of word uint8_t new_blanks; //blanks in empty cell int16_t chop_coord; //chop boundary int16_t prev_chop_coord; //start of cell int16_t rep_left; //left edge of rep word ROW *real_row; //output row C_OUTLINE_LIST left_coutlines; C_OUTLINE_LIST right_coutlines; C_BLOB_LIST cblobs; C_BLOB_IT cblob_it = &cblobs; WERD_LIST words; WERD_IT word_it = &words; //new words //repeated blobs WERD_IT rep_it = &row->rep_words; WERD *word; //new word int32_t xstarts[2]; //row ends int32_t prev_x; //end of prev blob //iterator BLOBNBOX_IT box_it = row->blob_list (); //boundaries ICOORDELT_IT cell_it = &row->char_cells; #ifndef GRAPHICS_DISABLED if (textord_show_page_cuts && to_win != nullptr) { plot_row_cells (to_win, ScrollView::RED, row, 0, &row->char_cells); } #endif prev_x = -INT16_MAX; bol = true; blanks = 0; if (rep_it.empty ()) rep_left = INT16_MAX; else rep_left = rep_it.data ()->bounding_box ().left (); if (box_it.empty ()) return nullptr; //empty row xstarts[0] = box_it.data ()->bounding_box ().left (); if (rep_left < xstarts[0]) { xstarts[0] = rep_left; } if (cell_it.empty () || row->char_cells.singleton ()) { tprintf ("Row without enough char cells!\n"); tprintf ("Leftmost blob is at (%d,%d)\n", box_it.data ()->bounding_box ().left (), box_it.data ()->bounding_box ().bottom ()); return nullptr; } ASSERT_HOST (!cell_it.empty () && !row->char_cells.singleton ()); prev_chop_coord = cell_it.data ()->x (); word = nullptr; while (rep_left < cell_it.data ()->x ()) { word = add_repeated_word (&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch, &word_it); } cell_it.mark_cycle_pt (); if (prev_chop_coord >= cell_it.data ()->x ()) cell_it.forward (); for (; !cell_it.cycled_list (); cell_it.forward ()) { chop_coord = cell_it.data ()->x (); while (!box_it.empty () && box_it.data ()->bounding_box ().left () <= chop_coord) { if (box_it.data ()->bounding_box ().right () > prev_x) prev_x = box_it.data ()->bounding_box ().right (); split_to_blob (box_it.extract (), chop_coord, textord_fp_chop_error + 0.5f, &left_coutlines, &right_coutlines); box_it.forward (); while (!box_it.empty() && box_it.data()->cblob() == nullptr) { delete box_it.extract(); box_it.forward(); } } if (!right_coutlines.empty() && left_coutlines.empty()) split_to_blob (nullptr, chop_coord, textord_fp_chop_error + 0.5f, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { cblob_it.add_after_then_move(new C_BLOB(&left_coutlines)); } else { if (rep_left < chop_coord) { if (rep_left > prev_chop_coord) new_blanks = (uint8_t) floor ((rep_left - prev_chop_coord) / row->fixed_pitch + 0.5); else new_blanks = 0; } else { if (chop_coord > prev_chop_coord) new_blanks = (uint8_t) floor ((chop_coord - prev_chop_coord) / row->fixed_pitch + 0.5); else new_blanks = 0; } if (!cblob_it.empty()) { if (blanks < 1 && word != nullptr && !word->flag (W_REP_CHAR)) blanks = 1; word = new WERD (&cblobs, blanks, nullptr); cblob_it.set_to_list (&cblobs); word->set_flag (W_DONT_CHOP, TRUE); word_it.add_after_then_move (word); if (bol) { word->set_flag (W_BOL, TRUE); bol = false; } blanks = new_blanks; } else blanks += new_blanks; while (rep_left < chop_coord) { word = add_repeated_word (&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch, &word_it); } } if (prev_chop_coord < chop_coord) prev_chop_coord = chop_coord; } if (!cblob_it.empty()) { word = new WERD(&cblobs, blanks, nullptr); word->set_flag (W_DONT_CHOP, TRUE); word_it.add_after_then_move (word); if (bol) word->set_flag (W_BOL, TRUE); } ASSERT_HOST (word != nullptr); while (!rep_it.empty ()) { add_repeated_word (&rep_it, rep_left, prev_chop_coord, blanks, row->fixed_pitch, &word_it); } //at end of line word_it.data ()->set_flag (W_EOL, TRUE); if (prev_chop_coord > prev_x) prev_x = prev_chop_coord; xstarts[1] = prev_x + 1; real_row = new ROW (row, (int16_t) row->kern_size, (int16_t) row->space_size); word_it.set_to_list (real_row->word_list ()); //put words in row word_it.add_list_after (&words); real_row->recalc_bounding_box (); return real_row; }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }
/** * word_display() Word Processor * * Display a word according to its display modes */ BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) { WERD* word = word_res->word; TBOX word_bb; // word bounding box int word_height; // ht of word BB BOOL8 displayed_something = FALSE; float shift; // from bot left C_BLOB_IT c_it; // cblob iterator if (color_mode != CM_RAINBOW && word_res->box_word != NULL) { BoxWord* box_word = word_res->box_word; int length = box_word->length(); if (word_res->fontinfo == NULL) return false; const FontInfo& font_info = *word_res->fontinfo; for (int i = 0; i < length; ++i) { ScrollView::Color color = ScrollView::GREEN; switch (color_mode) { case CM_SUBSCRIPT: if (box_word->BlobPosition(i) == SP_SUBSCRIPT) color = ScrollView::RED; break; case CM_SUPERSCRIPT: if (box_word->BlobPosition(i) == SP_SUPERSCRIPT) color = ScrollView::RED; break; case CM_ITALIC: if (font_info.is_italic()) color = ScrollView::RED; break; case CM_BOLD: if (font_info.is_bold()) color = ScrollView::RED; break; case CM_FIXEDPITCH: if (font_info.is_fixed_pitch()) color = ScrollView::RED; break; case CM_SERIF: if (font_info.is_serif()) color = ScrollView::RED; break; case CM_SMALLCAPS: if (word_res->small_caps) color = ScrollView::RED; break; case CM_DROPCAPS: if (box_word->BlobPosition(i) == SP_DROPCAP) color = ScrollView::RED; break; // TODO(rays) underline is currently completely unsupported. case CM_UNDERLINE: default: break; } image_win->Pen(color); TBOX box = box_word->BlobBox(i); image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top()); } return true; } /* Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color) etc. are to keep the compiler happy. */ // display bounding box if (word->display_flag(DF_BOX)) { word->bounding_box().plot(image_win, (ScrollView::Color)((inT32) editor_image_word_bb_color), (ScrollView::Color)((inT32) editor_image_word_bb_color)); ScrollView::Color c = (ScrollView::Color) ((inT32) editor_image_blob_bb_color); image_win->Pen(c); c_it.set_to_list(word->cblob_list()); for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) c_it.data()->bounding_box().plot(image_win); displayed_something = TRUE; } // display edge steps if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available word->plot(image_win); // rainbow colors displayed_something = TRUE; } // display poly approx if (word->display_flag(DF_POLYGONAL)) { // need to convert TWERD* tword = TWERD::PolygonalCopy(word); tword->plot(image_win); delete tword; displayed_something = TRUE; } // Display correct text and blamer information. STRING text; STRING blame; if (word->display_flag(DF_TEXT) && word->text() != NULL) { text = word->text(); } if (word->display_flag(DF_BLAMER) && !(word_res->blamer_bundle != NULL && word_res->blamer_bundle->incorrect_result_reason == IRR_CORRECT)) { text = ""; const BlamerBundle *blamer_bundle = word_res->blamer_bundle; if (blamer_bundle == NULL) { text += "NULL"; } else { for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) { text += blamer_bundle->truth_text[i]; } } text += " -> "; STRING best_choice_str; if (word_res->best_choice == NULL) { best_choice_str = "NULL"; } else { word_res->best_choice->string_and_lengths(&best_choice_str, NULL); } text += best_choice_str; IncorrectResultReason reason = (blamer_bundle == NULL) ? IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason; ASSERT_HOST(reason < IRR_NUM_REASONS) blame += " ["; blame += BlamerBundle::IncorrectReasonName(reason); blame += "]"; } if (text.length() > 0) { word_bb = word->bounding_box(); image_win->Pen(ScrollView::RED); word_height = word_bb.height(); int text_height = 0.50 * word_height; if (text_height > 20) text_height = 20; image_win->TextAttributes("Arial", text_height, false, false, false); shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f; image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height, text.string()); if (blame.length() > 0) { image_win->Text(word_bb.left() + shift, word_bb.bottom() + 0.25 * word_height - text_height, blame.string()); } displayed_something = TRUE; } if (!displayed_something) // display BBox anyway word->bounding_box().plot(image_win, (ScrollView::Color)((inT32) editor_image_word_bb_color), (ScrollView::Color)((inT32) editor_image_word_bb_color)); return TRUE; }
void apply_box_testing(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; INT16 row_count = 0; WERD_IT word_it; WERD *word; WERD *bln_word; INT16 word_count = 0; PBLOB_IT blob_it; DENORM denorm; INT16 count = 0; char ch[2]; WERD *outword; //bln best choice //segmentation WERD_CHOICE *best_choice; //tess output WERD_CHOICE *raw_choice; //top choice permuter //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; INT16 char_count = 0; INT16 correct_count = 0; INT16 err_count = 0; INT16 rej_count = 0; #ifndef SECURE_NAMES WERDSTATS wordstats; //As from newdiff #endif char tess_rej_str[3]; char tess_long_str[3]; ch[1] = '\0'; strcpy (tess_rej_str, "|A"); strcpy (tess_long_str, "|B"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row_count++; word_count = 0; word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_count++; if ((strlen (word->text ()) == 1) && !STRING (applybox_test_exclusions).contains (*word->text ()) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single char label and a single blob so test it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); char_count++; best_choice = tess_segment_pass1 (bln_word, &denorm, tess_default_matcher, raw_choice, &blob_choices, outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((best_choice->string ().length () == 0) || (strspn (best_choice->string ().string (), " ") == best_choice->string ().length ())) { rej_count++; tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", row_count, word_count, ch); #ifndef SECURE_NAMES wordstats.word (tess_rej_str, 2, ch, 1); #endif } else { if ((best_choice->string ().length () != outword->blob_list ()->length ()) || (best_choice->string ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", best_choice->string ().string (), best_choice->string ().length (), outword->blob_list ()->length (), blob_choices.length ()); } ASSERT_HOST (best_choice->string ().length () == outword->blob_list ()->length ()); ASSERT_HOST (best_choice->string ().length () == blob_choices.length ()); fix_quotes ((char *) best_choice->string ().string (), //turn to double outword, &blob_choices); if (strcmp (best_choice->string ().string (), ch) != 0) { err_count++; tprintf ("%d:%d: \"%s\" -> \"%s\"\n", row_count, word_count, ch, best_choice->string ().string ()); } else correct_count++; #ifndef SECURE_NAMES if (best_choice->string ().length () > 2) wordstats.word (tess_long_str, 2, ch, 1); else wordstats.word ((char *) best_choice->string (). string (), best_choice->string ().length (), ch, 1); #endif } delete bln_word; delete outword; delete best_choice; delete raw_choice; blob_choices.deep_clear (); count++; } } } } #ifndef SECURE_NAMES wordstats.print (1, 100.0); wordstats.conf_matrix (); tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", char_count, correct_count, rej_count, err_count); #endif }
/************************************************************************* * tidy_up() * - report >1 block * - sort the words in each row. * - report any rows with no labelled words. * - report any remaining unlabelled words * - report total labelled words * *************************************************************************/ void tidy_up( // BLOCK_LIST *block_list, //real blocks INT16 &ok_char_count, INT16 &ok_row_count, INT16 &unlabelled_words, INT16 *tgt_char_counts, INT16 &rebalance_count, char &min_char, INT16 &min_samples, INT16 &final_labelled_blob_count) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *duplicate_word; INT16 block_idx = 0; INT16 row_idx; INT16 all_row_idx = 0; BOOL8 row_ok; BOOL8 rebalance_needed = FALSE; //No. of unique labelled samples INT16 labelled_char_counts[128]; INT16 i; char ch; char prev_ch = '\0'; BOOL8 at_dupe_of_prev_word; ROW *prev_row = NULL; INT16 left; INT16 prev_left = -1; for (i = 0; i < 128; i++) labelled_char_counts[i] = 0; ok_char_count = 0; ok_row_count = 0; unlabelled_words = 0; if ((applybox_debug > 4) && (block_it.length () != 1)) tprintf ("APPLY_BOXES: More than one block??\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_idx++; row_idx = 0; row_ok = FALSE; row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_idx++; all_row_idx++; row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if (strlen (word->text ()) == 0) { unlabelled_words++; if (applybox_debug > 4) { tprintf ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); } } else { if (word->gblob_list ()->length () != 1) tprintf ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); ok_char_count++; labelled_char_counts[*word->text ()]++; row_ok = TRUE; } } if ((applybox_debug > 4) && (!row_ok)) { tprintf ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n", block_idx, row_idx, all_row_idx); } else ok_row_count++; } } min_samples = 9999; for (i = 0; i < 128; i++) { if (tgt_char_counts[i] > labelled_char_counts[i]) { if (labelled_char_counts[i] <= 1) { tprintf ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n", labelled_char_counts[i], (char) i, tgt_char_counts[i]); } else { rebalance_needed = TRUE; if (applybox_debug > 0) tprintf ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n", (char) i, tgt_char_counts[i], labelled_char_counts[i]); } } if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { min_samples = labelled_char_counts[i]; min_char = (char) i; } } while (applybox_rebalance && rebalance_needed) { block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); left = word->bounding_box ().left (); ch = *word->text (); at_dupe_of_prev_word = ((row == prev_row) && (left = prev_left) && (ch == prev_ch)); if ((ch != '\0') && (labelled_char_counts[ch] > 1) && (tgt_char_counts[ch] > labelled_char_counts[ch]) && (!at_dupe_of_prev_word)) { /* Duplicate the word to rebalance the labelled samples */ if (applybox_debug > 9) { tprintf ("Duping \"%c\" from ", ch); word->bounding_box ().print (); } duplicate_word = new WERD; *duplicate_word = *word; word_it.add_after_then_move (duplicate_word); rebalance_count++; labelled_char_counts[ch]++; } prev_row = row; prev_left = left; prev_ch = ch; } } } rebalance_needed = FALSE; for (i = 0; i < 128; i++) { if ((tgt_char_counts[i] > labelled_char_counts[i]) && (labelled_char_counts[i] > 1)) { rebalance_needed = TRUE; break; } } } /* Now final check - count labelled blobs */ final_labelled_blob_count = 0; block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) == 1) && (word->gblob_list ()->length () == 1)) final_labelled_blob_count++; } } } }
INT16 resegment_box( // ROW *row, BOX box, char *ch, INT16 block_id, INT16 row_id, INT16 boxfile_lineno, INT16 boxfile_charno) { WERD_IT word_it; WERD *word; WERD *new_word = NULL; BOOL8 polyg = false; PBLOB_IT blob_it; PBLOB_IT new_blob_it; PBLOB *blob; PBLOB *new_blob; OUTLINE_IT outline_it; OUTLINE_LIST dummy; // Just to initialize new_outline_it. OUTLINE_IT new_outline_it = &dummy; OUTLINE *outline; BOX new_word_box; float word_x_centre; float baseline; INT16 error_count = 0; //number of chars lost word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (gblob_bounding_box (blob, polyg).overlap (box)) { outline_it.set_to_list (gblob_out_list (blob, polyg)); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline = outline_it.data (); if (goutline_bounding_box (outline, polyg). major_overlap (box)) { if (strlen (word->text ()) > 0) { if (error_count == 0) { error_count = 1; if (applybox_debug > 4) report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! box overlaps blob in labelled word"); } if (applybox_debug > 4) tprintf ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n", block_id, row_id, word_it.data ()->text ()); word_it.data ()->set_text (""); //UN label it error_count++; } if (error_count == 0) { if (new_word == NULL) { /* Make a new word with a single blob */ new_word = word->shallow_copy (); new_word->set_text (ch); if (polyg) new_blob = new PBLOB; else new_blob = (PBLOB *) new C_BLOB; new_blob_it.set_to_list (new_word-> gblob_list ()); new_blob_it.add_to_end (new_blob); new_outline_it. set_to_list (gblob_out_list (new_blob, polyg)); } new_outline_it.add_to_end (outline_it. extract ()); //move blob } } } //no outlines in blob if (outline_it.empty ()) //so delete blob delete blob_it.extract (); } } if (blob_it.empty ()) //no blobs in word //so delete word delete word_it.extract (); } } if (error_count > 0) return error_count; if (new_word != NULL) { gblob_sort_list (new_word->gblob_list (), polyg); word_it.add_to_end (new_word); new_word_box = new_word->bounding_box (); word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f; baseline = row->base_line (word_x_centre); if (STRING (chs_caps_ht).contains (ch[0]) && (new_word_box.top () < baseline + (1 + applybox_error_band) * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! caps-ht char didn't ascend"); new_word->set_text (""); return 1; } if (STRING (chs_odd_top).contains (ch[0]) && (new_word_box.top () < baseline + (1 - applybox_error_band) * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! Odd top char below xht"); new_word->set_text (""); return 1; } if (STRING (chs_x_ht).contains (ch[0]) && ((new_word_box.top () > baseline + (1 + applybox_error_band) * row->x_height ()) || (new_word_box.top () < baseline + (1 - applybox_error_band) * row->x_height ()))) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! x-ht char didn't have top near xht"); new_word->set_text (""); return 1; } if (STRING (chs_non_ambig_bl).contains (ch[0]) && ((new_word_box.bottom () < baseline - applybox_error_band * row->x_height ()) || (new_word_box.bottom () > baseline + applybox_error_band * row->x_height ()))) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! non ambig BL char didnt have bottom near baseline"); new_word->set_text (""); return 1; } if (STRING (chs_odd_bot).contains (ch[0]) && (new_word_box.bottom () > baseline + applybox_error_band * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! Odd bottom char above baseline"); new_word->set_text (""); return 1; } if (STRING (chs_desc).contains (ch[0]) && (new_word_box.bottom () > baseline - applybox_error_band * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! Descender doesn't descend"); new_word->set_text (""); return 1; } return 0; } else { report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! Couldn't find any blobs"); return 1; } }
ROW *find_row_of_box( // BLOCK_LIST *block_list, //real blocks BOX box, //from boxfile INT16 &block_id, INT16 &row_id_to_process) { BLOCK_IT block_it(block_list); BLOCK *block; ROW_IT row_it; ROW *row; ROW *row_to_process = NULL; INT16 row_id; WERD_IT word_it; WERD *word; BOOL8 polyg; PBLOB_IT blob_it; PBLOB *blob; OUTLINE_IT outline_it; OUTLINE *outline; /* Find row to process - error if box REALLY overlaps more than one row. (I.e it overlaps blobs in the row - not just overlaps the bounding box of the whole row.) */ block_id = 0; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_id++; row_id = 0; block = block_it.data (); if (block->bounding_box ().overlap (box)) { row_it.set_to_list (block->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_id++; row = row_it.data (); if (row->bounding_box ().overlap (box)) { word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (gblob_bounding_box (blob, polyg). overlap (box)) { outline_it. set_to_list (gblob_out_list (blob, polyg)); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline = outline_it.data (); if (goutline_bounding_box (outline, polyg).major_overlap (box)) { if ((row_to_process == NULL) || (row_to_process == row)) { row_to_process = row; row_id_to_process = row_id; } else /* RETURN ERROR Box overlaps blobs in more than one row */ return NULL; } } } } } } } } } } return row_to_process; }
bool Textord::clean_noise_from_row( //remove empties ROW* row //row to clean ) { bool testing_on; TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count = 0; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t super_norm_count; //real char-like //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator testing_on = textord_test_y > row->base_line (textord_test_x) && textord_show_blobs && textord_test_y < row->base_line (textord_test_x) + row->x_height (); dot_count = 0; norm_count = 0; super_norm_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) super_norm_count++; //count smal outlines } } else super_norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; if (testing_on) { tprintf ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left (), blob_box.bottom (), blob_box.right (), blob_box.top (), blob->out_list ()->length (), trans_count, blob_box.bottom () - row->base_line (blob_box.left ())); } } } if (textord_noise_debug) { tprintf ("Row ending at (%d,%g):", blob_box.right (), row->base_line (blob_box.right ())); tprintf (" R=%g, dc=%d, nc=%d, %s\n", norm_count > 0 ? (float) dot_count / norm_count : 9999, dot_count, norm_count, dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED" : "ACCEPTED"); } return super_norm_count < textord_noise_sncount && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; }
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t dud_words; //number discarded int32_t ok_words; //number remaining int32_t word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; // was it chucked std::vector<int8_t> word_dud(ok_words); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else { word_dud[word_index] = 0; } if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data(); // Current word. // Previously we threw away the entire word. // Now just aggressively throw all small blobs into the reject list, where // the classifier can decide whether they are actually needed. word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } }