void draw_meanlines( //draw a block TO_BLOCK *block, //block to draw float gradient, //gradients of lines inT32 left, //edge of block ScrollView::Color colour, //colour to draw in FCOORD rotation //rotation for line ) { FCOORD plot_pt; //point to plot //rows TO_ROW_IT row_it = block->get_rows(); TO_ROW *row; //current row BLOBNBOX_IT blob_it; //blobs float right; //end of row to_win->Pen(colour); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { row = row_it.data(); blob_it.set_to_list(row->blob_list()); blob_it.move_to_last(); right = blob_it.data()->bounding_box().right(); plot_pt = FCOORD((float) left, gradient * left + row->parallel_c() + row->xheight); plot_pt.rotate(rotation); to_win->SetCursor(plot_pt.x(), plot_pt.y()); plot_pt = FCOORD((float) right, gradient * right + row->parallel_c() + row->xheight); plot_pt.rotate(rotation); to_win->DrawTo(plot_pt.x(), plot_pt.y()); } }
// Fits splines to the textlines, or creates fake QSPLINES from the straight // baselines that are already on the TO_ROWs. // As a side-effect, computes the xheights of the rows and the block. // Although x-height estimation is conceptually separate, it is part of // detecting perspective distortion and therefore baseline fitting. void BaselineBlock::FitBaselineSplines(bool enable_splines, bool show_final_rows, Textord* textord) { double gradient = tan(skew_angle_); FCOORD rotation(1.0f, 0.0f); if (enable_splines) { textord->make_spline_rows(block_, gradient, show_final_rows); } else { // Make a fake spline from the existing line. TBOX block_box= block_->block->bounding_box(); TO_ROW_IT row_it = block_->get_rows(); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { TO_ROW* row = row_it.data(); inT32 xstarts[2] = { block_box.left(), block_box.right() }; double coeffs[3] = { 0.0, row->line_m(), row->line_c() }; row->baseline = QSPLINE(1, xstarts, coeffs); textord->compute_row_xheight(row, block_->block->classify_rotation(), row->line_m(), block_->line_size); } } textord->compute_block_xheight(block_, gradient); block_->block->set_xheight(block_->xheight); if (textord_restore_underlines) // fix underlines restore_underlined_blobs(block_); }
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); TO_ROW* row = to_row_it.data(); // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready // to create the word. C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(row->blob_list()); for (;!box_it.empty(); box_it.forward()) { BLOBNBOX* bblob= box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL) { C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); delete bblob; } } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size), static_cast<inT16>(row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word_it.add_after_then_move(word); ROW_IT row_it(real_rows); row_it.add_after_then_move(real_row); }
//yangjing01 modified : bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); ROW_IT row_it(real_rows); //to_real_row is the real row information of single row or single char mode TO_ROW* real_to_row = NULL; float row_max_height = 0.0; for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()){ TO_ROW* row = to_row_it.data(); float row_min_y = row->min_y(); float row_max_y = row->max_y(); float row_height = abs(row_max_y - row_min_y); if (real_to_row == NULL || row_height > row_max_height || fabs(row_height - row_max_height) < 1.0f){ row_max_height = row_height; real_to_row = row; } } if (real_to_row == NULL){ return false; } C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(real_to_row->blob_list()); for (; !box_it.empty(); box_it.forward()){ BLOBNBOX* bblob = box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL){ C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size), static_cast<inT16>(real_to_row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); row_it.add_after_then_move(real_row); return true; }
void make_real_words( tesseract::Textord *textord, TO_BLOCK *block, //block to do FCOORD rotation //for drawing ) { TO_ROW *row; //current row TO_ROW_IT row_it = block->get_rows (); ROW *real_row = NULL; //output row ROW_IT real_row_it = block->block->row_list (); if (row_it.empty ()) return; //empty block for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); if (row->blob_list ()->empty () && !row->rep_words.empty ()) { real_row = make_rep_words (row, block); } else if (!row->blob_list()->empty()) { // In a fixed pitch document, some lines may be detected as fixed pitch // while others don't, and will go through different path. // For non-space delimited language like CJK, fixed pitch chop always // leave the entire line as one word. We can force consistent chopping // with force_make_prop_words flag. POLY_BLOCK* pb = block->block->poly_block(); if (textord_chopper_test) { real_row = textord->make_blob_words (row, rotation); } else if (textord_force_make_prop_words || (pb != NULL && !pb->IsText()) || row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) { real_row = textord->make_prop_words (row, rotation); } else if (row->pitch_decision == PITCH_DEF_FIXED || row->pitch_decision == PITCH_CORR_FIXED) { real_row = fixed_pitch_words (row, rotation); } else { ASSERT_HOST(FALSE); } } if (real_row != NULL) { //put row in block real_row_it.add_after_then_move (real_row); } } block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size, (inT16) block->space_size, (inT16) block->fixed_pitch); block->block->check_pitch (); }
void set_row_spaces( //find space sizes TO_BLOCK *block, //block to do FCOORD rotation, //for drawing BOOL8 testing_on //correct orientation ) { inT32 maxwidth; //of widest space TO_ROW *row; //current row TO_ROW_IT row_it = block->get_rows (); if (row_it.empty ()) return; //empty block maxwidth = (inT32) ceil (block->xheight * textord_words_maxspace); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); if (row->fixed_pitch == 0) { // if (!textord_test_mode // && row_words(block,row,maxwidth,rotation,testing_on)==0 // || textord_test_mode // && row_words2(block,row,maxwidth,rotation,testing_on)==0) // { row->min_space = (inT32) ceil (row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread); row->max_nonspace = (inT32) floor (row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread); if (testing_on && textord_show_initial_words) { tprintf ("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace, row->min_space, row->intercept ()); } row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = row->pr_space; row->kern_size = row->pr_nonsp; // } } #ifndef GRAPHICS_DISABLED if (textord_show_initial_words && testing_on) { plot_word_decisions (to_win, (inT16) row->fixed_pitch, row); } #endif } }
void set_row_spaces( //find space sizes TO_BLOCK* block, //block to do FCOORD rotation, //for drawing bool testing_on //correct orientation ) { TO_ROW *row; //current row TO_ROW_IT row_it = block->get_rows (); if (row_it.empty ()) return; //empty block for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); if (row->fixed_pitch == 0) { row->min_space = static_cast<int32_t>(ceil (row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); row->max_nonspace = static_cast<int32_t>(floor (row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread)); if (testing_on && textord_show_initial_words) { tprintf ("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace, row->min_space, row->intercept ()); } row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = row->pr_space; row->kern_size = row->pr_nonsp; } #ifndef GRAPHICS_DISABLED if (textord_show_initial_words && testing_on) { plot_word_decisions (to_win, static_cast<int16_t>(row->fixed_pitch), row); } #endif } }
void restore_underlined_blobs( //get chop points TO_BLOCK *block //block to do ) { int16_t chop_coord; //chop boundary TBOX blob_box; //of underline BLOBNBOX *u_line; //underline bit TO_ROW *row; //best row for blob ICOORDELT_LIST chop_cells; //blobs to cut out //real underlines BLOBNBOX_LIST residual_underlines; C_OUTLINE_LIST left_coutlines; C_OUTLINE_LIST right_coutlines; ICOORDELT_IT cell_it = &chop_cells; //under lines BLOBNBOX_IT under_it = &block->underlines; BLOBNBOX_IT ru_it = &residual_underlines; if (block->get_rows()->empty()) return; // Don't crash if there are no rows. for (under_it.mark_cycle_pt (); !under_it.cycled_list (); under_it.forward ()) { u_line = under_it.extract (); blob_box = u_line->bounding_box (); row = most_overlapping_row (block->get_rows (), u_line); if (row == nullptr) return; // Don't crash if there is no row. find_underlined_blobs (u_line, &row->baseline, row->xheight, row->xheight * textord_underline_offset, &chop_cells); cell_it.set_to_list (&chop_cells); for (cell_it.mark_cycle_pt (); !cell_it.cycled_list (); cell_it.forward ()) { chop_coord = cell_it.data ()->x (); if (cell_it.data ()->y () - chop_coord > textord_fp_chop_error + 1) { split_to_blob (u_line, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines))); } chop_coord = cell_it.data ()->y (); split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { row->insert_blob(new BLOBNBOX(new C_BLOB(&left_coutlines))); } u_line = nullptr; //no more blobs to add } delete cell_it.extract(); } if (!right_coutlines.empty ()) { split_to_blob(nullptr, blob_box.right(), textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines))); } if (u_line != nullptr) { delete u_line->cblob(); delete u_line; 
} } if (!ru_it.empty()) { ru_it.move_to_first(); for (ru_it.mark_cycle_pt(); !ru_it.cycled_list(); ru_it.forward()) { under_it.add_after_then_move(ru_it.extract()); } } }
/**
 * GAPMAP::GAPMAP
 *
 * Build the gap map of a block. The horizontal extent [min_left, max_right]
 * of all non-empty rows is divided into buckets of half the median row
 * x-height, and map[i] counts the rows that have a big gap (wider than
 * gapmap_big_gaps * xheight and wider than 2) covering bucket i. Leading and
 * trailing space of a row count as gaps when gapmap_use_ends is set.
 * any_tabs is set when some bucket is covered in more than half of the rows;
 * with gapmap_no_isolated_quanta, such a bucket is cleared instead if neither
 * neighbour is also above that limit.
 * With fewer than 3 non-empty rows, or no horizontal extent, no map is built.
 */
GAPMAP::GAPMAP(      // Constructor
    TO_BLOCK *block  // block
) {
  TO_ROW_IT row_it;     // row iterator
  TO_ROW *row;          // current row
  BLOBNBOX_IT blob_it;  // iterator
  TBOX blob_box;
  TBOX prev_blob_box;
  inT16 gap_width;
  inT16 start_of_row;
  inT16 end_of_row;
  STATS xht_stats(0, 128);
  inT16 min_quantum;
  inT16 max_quantum;
  inT16 i;

  row_it.set_to_list(block->get_rows());
  /*
    Find left and right extremes and bucket size
  */
  map = NULL;
  min_left = MAX_INT16;
  max_right = -MAX_INT16;
  total_rows = 0;
  any_tabs = FALSE;
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    if (!row->blob_list()->empty()) {
      total_rows++;
      xht_stats.add(static_cast<inT16>(floor(row->xheight + 0.5)), 1);
      blob_it.set_to_list(row->blob_list());
      start_of_row = blob_it.data()->bounding_box().left();
      end_of_row = blob_it.data_relative(-1)->bounding_box().right();
      if (min_left > start_of_row)
        min_left = start_of_row;
      if (max_right < end_of_row)
        max_right = end_of_row;
    }
  }
  if ((total_rows < 3) || (min_left >= max_right)) {
    // Leave every member in a defined state when no map is built.
    bucket_size = 0;
    map_max = 0;
    total_rows = 0;
    min_left = max_right = 0;
    return;
  }
  bucket_size = static_cast<inT16>(floor(xht_stats.median() + 0.5)) / 2;
  // A median x-height below 2 would give a zero bucket size and a division
  // by zero in every quantum computation below.
  if (bucket_size < 1)
    bucket_size = 1;
  map_max = (max_right - min_left) / bucket_size;
  map = (inT16 *) alloc_mem((map_max + 1) * sizeof(inT16));
  for (i = 0; i <= map_max; i++)
    map[i] = 0;

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    if (!row->blob_list()->empty()) {
      blob_it.set_to_list(row->blob_list());
      blob_it.mark_cycle_pt();
      blob_box = box_next(&blob_it);
      prev_blob_box = blob_box;
      if (gapmap_use_ends) {
        /* Leading space */
        gap_width = blob_box.left() - min_left;
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          max_quantum = (blob_box.left() - min_left) / bucket_size;
          for (i = 0; i <= max_quantum; i++)
            map[i]++;
        }
      }
      while (!blob_it.cycled_list()) {
        blob_box = box_next(&blob_it);
        gap_width = blob_box.left() - prev_blob_box.right();
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
          max_quantum = (blob_box.left() - min_left) / bucket_size;
          for (i = min_quantum; i <= max_quantum; i++)
            map[i]++;
        }
        prev_blob_box = blob_box;
      }
      if (gapmap_use_ends) {
        /* Trailing space */
        gap_width = max_right - prev_blob_box.right();
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
          for (i = min_quantum; i <= map_max; i++)
            map[i]++;
        }
      }
    }
  }

  const int half_rows = total_rows / 2;
  for (i = 0; i <= map_max; i++) {
    if (map[i] > half_rows) {
      // A neighbour that does not exist counts as "not above the limit";
      // this also keeps map[-1] and map[map_max + 1] from being read when
      // the map has a single bucket.
      const bool left_is_low = (i == 0) || (map[i - 1] <= half_rows);
      const bool right_is_low = (i == map_max) || (map[i + 1] <= half_rows);
      if (gapmap_no_isolated_quanta && left_is_low && right_is_low) {
        map[i] = 0;  // prevent isolated quantum
      } else {
        any_tabs = TRUE;
      }
    }
  }
  if (gapmap_debug && any_tabs)
    tprintf("Table found\n");
}