void BLOCK::compress() { // squash it up #define ROW_SPACING 5 ROW_IT row_it(&rows); ROW *row; ICOORD row_spacing (0, ROW_SPACING); ICOORDELT_IT icoordelt_it; sort_rows(); box = TBOX (box.topleft (), box.topleft ()); box.move_bottom_edge (ROW_SPACING); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row->move (box.botleft () - row_spacing - row->bounding_box ().topleft ()); box += row->bounding_box (); } leftside.clear (); icoordelt_it.set_to_list (&leftside); icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.bottom ())); icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.top ())); rightside.clear (); icoordelt_it.set_to_list (&rightside); icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.bottom ())); icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.top ())); }
// Walks the segmentation (blocks -> rows -> words) and prints the number of
// blocks, rows and words visited, plus the summed length of the cblob lists
// of all words.
void PrintSegmentationStats(BLOCK_LIST* block_list) {
  int block_count = 0;
  int row_count = 0;
  int word_count = 0;
  int blob_count = 0;

  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ++block_count;
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      ++row_count;
      // Visit every word of the row.
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        ++word_count;
        blob_count += words.data()->cblob_list()->length();
      }
    }
  }

  tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n",
          block_count, row_count, word_count, blob_count);
}
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); TO_ROW* row = to_row_it.data(); // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready // to create the word. C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(row->blob_list()); for (;!box_it.empty(); box_it.forward()) { BLOBNBOX* bblob= box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL) { C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); delete bblob; } } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size), static_cast<inT16>(row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word_it.add_after_then_move(word); ROW_IT row_it(real_rows); row_it.add_after_then_move(real_row); }
//yangjing01 modified : bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); ROW_IT row_it(real_rows); //to_real_row is the real row information of single row or single char mode TO_ROW* real_to_row = NULL; float row_max_height = 0.0; for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()){ TO_ROW* row = to_row_it.data(); float row_min_y = row->min_y(); float row_max_y = row->max_y(); float row_height = abs(row_max_y - row_min_y); if (real_to_row == NULL || row_height > row_max_height || fabs(row_height - row_max_height) < 1.0f){ row_max_height = row_height; real_to_row = row; } } if (real_to_row == NULL){ return false; } C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(real_to_row->blob_list()); for (; !box_it.empty(); box_it.forward()){ BLOBNBOX* bblob = box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL){ C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size), static_cast<inT16>(real_to_row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); row_it.add_after_then_move(real_row); return true; }
// Sets up the baseline state for one TO_BLOCK.
//
// debug_level: stored for later use.
// non_text:    stored as the non-text flag of the block.
// block:       the TO_BLOCK being modelled; its line_spacing seeds
//              line_spacing_.
//
// For every row of the block the blobs are sorted with blob_x_order and a
// BaselineRow is created and appended to rows_.
BaselineBlock::BaselineBlock(int debug_level, bool non_text, TO_BLOCK* block)
    : block_(block),
      debug_level_(debug_level),
      non_text_block_(non_text),
      good_skew_angle_(false),
      skew_angle_(0.0),
      line_spacing_(block->line_spacing),
      line_offset_(0.0),
      model_error_(0.0) {
  TO_ROW_IT to_rows(block_->get_rows());
  for (to_rows.mark_cycle_pt(); !to_rows.cycled_list(); to_rows.forward()) {
    TO_ROW* to_row = to_rows.data();
    // The blobs are sorted before the BaselineRow is built from the row.
    to_row->blob_list()->sort(blob_x_order);
    rows_.push_back(new BaselineRow(block->line_spacing, to_row));
  }
}
// Helper computes median xheight in the image. static double MedianXHeight(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); STATS xheights(0, block_it.data()->bounding_box().height()); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { ROW_IT row_it(block_it.data()->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { xheights.add(IntCastRounded(row_it.data()->x_height()), 1); } } return xheights.median(); }
// Sets the text of every word in every row of every block to the empty
// string.
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        words.data()->set_text("");
      }
    }
  }
}
// Sets the parameters in TO_BLOCK that are needed by subsequent processes. void BaselineBlock::SetupBlockParameters() const { if (line_spacing_ > 0.0) { // Where was block_line_spacing set before? float min_spacing = MIN(block_->line_spacing, line_spacing_); if (min_spacing < block_->line_size) block_->line_size = min_spacing; block_->line_spacing = line_spacing_; block_->baseline_offset = line_offset_; block_->max_blob_size = line_spacing_ * kMaxBlobSizeMultiple; } // Setup the parameters on all the rows. TO_ROW_IT row_it(block_->get_rows()); for (int r = 0; r < rows_.size(); ++r, row_it.forward()) { BaselineRow* row = rows_[r]; TO_ROW* to_row = row_it.data(); row->SetupOldLineParameters(to_row); } }
// This method resolves the cc bbox to a particular row and returns the row's // xheight. int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) { if (!segmentation_block_list_) { return global_xheight_; } // Compute the box coordinates in Tesseract's coordinate system. TBOX bbox(cc_bbox->x, pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1, cc_bbox->x + cc_bbox->w, pixGetHeight(orig_pix_) - cc_bbox->y - 1); // Iterate over all blocks. BLOCK_IT block_it(segmentation_block_list_); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); // Iterate over all rows in the block. ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); if (!row->bounding_box().major_overlap(bbox)) { continue; } // Row could be skewed, warped, etc. Use the position of the box to // determine the baseline position of the row for that x-coordinate. // Create a square TBOX whose baseline's mid-point lies at this point // and side is row's xheight. Take the overlap of this box with the input // box and check if it is a 'major overlap'. If so, this box lies in this // row. In that case, return the xheight for this row. float box_middle = 0.5 * (bbox.left() + bbox.right()); int baseline = static_cast<int>(row->base_line(box_middle) + 0.5); TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2, static_cast<int>(baseline + row->x_height())); // Compute overlap. If it is is a major overlap, this is the right row. if (bbox.major_overlap(test_box)) { return row->x_height(); } } } // No row found for this bbox. return kUnspecifiedXheight; }
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list, C_BLOB_LIST* new_blobs, C_BLOB_LIST* not_found_blobs) { // Now iterate over all the blobs in the segmentation_block_list_, and just // replace the corresponding c-blobs inside the werds. BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); if (block->poly_block() != NULL && !block->poly_block()->IsText()) continue; // Don't touch non-text blocks. // Iterate over all rows in the block. ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Iterate over all werds in the row. WERD_IT werd_it(row->word_list()); WERD_LIST new_words; WERD_IT new_words_it(&new_words); for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { WERD* werd = werd_it.extract(); WERD* new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs); if (new_werd) { // Insert this new werd into the actual row's werd-list. Remove the // existing one. new_words_it.add_after_then_move(new_werd); delete werd; } else { // Reinsert the older word back, for lack of better options. // This is critical since dropping the words messes up segmentation: // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on. new_words_it.add_after_then_move(werd); } } // Get rid of the old word list & replace it with the new one. row->word_list()->clear(); werd_it.move_to_first(); werd_it.add_list_after(&new_words); } } }
// Gathers the blobs of the whole segmentation into output_blob_list.
//
// For every word, its cblob list and then its rejected-cblob list are added
// at the end of the output list with add_list_after.
// NOTE(review): presumably add_list_after moves the blobs out of the words —
// confirm against the list implementation.
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks,
                                  C_BLOB_LIST* output_blob_list) {
  C_BLOB_IT output_it(output_blob_list);
  BLOCK_IT blocks_it(blocks);
  for (blocks_it.mark_cycle_pt(); !blocks_it.cycled_list();
       blocks_it.forward()) {
    ROW_IT rows_it(blocks_it.data()->row_list());
    for (rows_it.mark_cycle_pt(); !rows_it.cycled_list(); rows_it.forward()) {
      // Visit every word of the row.
      WERD_IT words_it(rows_it.data()->word_list());
      for (words_it.mark_cycle_pt(); !words_it.cycled_list();
           words_it.forward()) {
        WERD* word = words_it.data();

        output_it.move_to_last();
        output_it.add_list_after(word->cblob_list());

        output_it.move_to_last();
        output_it.add_list_after(word->rej_cblob_list());
      }
    }
  }
}
// Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } }
void BLOCK::sort_rows() { // order on "top" ROW_IT row_it(&rows); row_it.sort (decreasing_top_order); }
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls // TransferDiacriticsToWords to copy the diacritic blobs to the most // appropriate words in the group of blocks. Source blobs are not touched. void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks) { // Angle difference larger than this is too much to consider equal. // They should only be in multiples of M_PI/2 anyway. const double kMaxAngleDiff = 0.01; // About 0.6 degrees. PointerVector<BlockGroup> groups; BLOCK_IT bk_it(blocks); for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { BLOCK* block = bk_it.data(); if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { continue; } // Linear search of the groups to find a matching rotation. float block_angle = block->re_rotation().angle(); int best_g = 0; float best_angle_diff = MAX_FLOAT32; for (int g = 0; g < groups.size(); ++g) { double angle_diff = fabs(block_angle - groups[g]->angle); if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); if (angle_diff < best_angle_diff) { best_angle_diff = angle_diff; best_g = g; } } if (best_angle_diff > kMaxAngleDiff) { groups.push_back(new BlockGroup(block)); } else { groups[best_g]->blocks.push_back(block); groups[best_g]->bounding_box += block->pdblk.bounding_box(); float x_height = block->x_height(); if (x_height < groups[best_g]->min_xheight) groups[best_g]->min_xheight = x_height; } } // Now process each group of blocks. 
PointerVector<WordWithBox> word_ptrs; for (int g = 0; g < groups.size(); ++g) { const BlockGroup* group = groups[g]; if (group->bounding_box.null_box()) continue; WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), group->bounding_box.topright()); for (int b = 0; b < group->blocks.size(); ++b) { ROW_IT row_it(group->blocks[b]->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); // Put the words of the row into the grid. WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); WordWithBox* box_word = new WordWithBox(word); word_grid.InsertBBox(true, true, box_word); // Save the pointer where it will be auto-deleted. word_ptrs.push_back(box_word); } } } FCOORD rotation = group->rotation; // Make it a forward rotation that will transform blob coords to block. rotation.set_y(-rotation.y()); TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); } }
// Constructor. Builds a histogram ("map") over horizontal buckets of the
// block that counts, for each bucket, how many rows have a big gap covering
// it. Buckets covered in more than half of the rows mark a tab/table gap and
// set any_tabs.
//
// block: the TO_BLOCK whose rows are examined.
//
// No map is built (map stays nullptr, bucket_size/map_max/total_rows/min_left/
// max_right are all 0 and any_tabs is false) when fewer than 3 rows have
// blobs, when the horizontal extent is empty, or when the median x-height is
// too small to give a non-zero bucket size.
GAPMAP::GAPMAP(TO_BLOCK *block) {
  TO_ROW *row;               // current row
  BLOBNBOX_IT blob_it;       // iterator
  TBOX blob_box;
  TBOX prev_blob_box;
  int16_t gap_width;
  int16_t start_of_row;
  int16_t end_of_row;
  STATS xht_stats(0, 128);
  int16_t min_quantum;
  int16_t max_quantum;
  int16_t i;

  /* Find left and right extremes and bucket size */
  map = nullptr;
  min_left = INT16_MAX;
  max_right = -INT16_MAX;
  total_rows = 0;
  any_tabs = false;

  // row iterator
  TO_ROW_IT row_it(block->get_rows());
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    if (!row->blob_list()->empty()) {
      total_rows++;
      xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
      blob_it.set_to_list(row->blob_list());
      // Left edge of the first blob and right edge of the last blob.
      start_of_row = blob_it.data()->bounding_box().left();
      end_of_row = blob_it.data_relative(-1)->bounding_box().right();
      if (min_left > start_of_row)
        min_left = start_of_row;
      if (max_right < end_of_row)
        max_right = end_of_row;
    }
  }

  bool usable = (total_rows >= 3) && (min_left < max_right);
  if (usable) {
    bucket_size =
        static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
    // A median x-height below 2 gives a zero bucket size, which would be
    // used as a divisor below; treat that like any other unusable block.
    usable = bucket_size > 0;
  }
  if (!usable) {
    bucket_size = 0;
    map_max = 0;
    total_rows = 0;
    min_left = max_right = 0;
    return;
  }

  map_max = (max_right - min_left) / bucket_size;
  map = new int16_t[map_max + 1]();  // value-initialised: all buckets are 0

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    row = row_it.data();
    if (!row->blob_list()->empty()) {
      blob_it.set_to_list(row->blob_list());
      blob_it.mark_cycle_pt();
      blob_box = box_next(&blob_it);
      prev_blob_box = blob_box;
      if (gapmap_use_ends) {
        /* Leading space */
        gap_width = blob_box.left() - min_left;
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          max_quantum = (blob_box.left() - min_left) / bucket_size;
          if (max_quantum > map_max)
            max_quantum = map_max;
          for (i = 0; i <= max_quantum; i++)
            map[i]++;
        }
      }
      /* Gaps between consecutive blobs */
      while (!blob_it.cycled_list()) {
        blob_box = box_next(&blob_it);
        gap_width = blob_box.left() - prev_blob_box.right();
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
          max_quantum = (blob_box.left() - min_left) / bucket_size;
          if (max_quantum > map_max)
            max_quantum = map_max;
          for (i = min_quantum; i <= max_quantum; i++)
            map[i]++;
        }
        prev_blob_box = blob_box;
      }
      if (gapmap_use_ends) {
        /* Trailing space */
        gap_width = max_right - prev_blob_box.right();
        if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
          min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
          if (min_quantum < 0)
            min_quantum = 0;
          for (i = min_quantum; i <= map_max; i++)
            map[i]++;
        }
      }
    }
  }

  for (i = 0; i <= map_max; i++) {
    if (map[i] > total_rows / 2) {
      // A neighbour that does not exist counts as being below the threshold.
      // Checking the index first also keeps the reads inside the array when
      // the map has a single bucket (map_max == 0).
      const bool left_is_low = (i == 0) || (map[i - 1] <= total_rows / 2);
      const bool right_is_low =
          (i == map_max) || (map[i + 1] <= total_rows / 2);
      if (gapmap_no_isolated_quanta && left_is_low && right_is_low) {
        map[i] = 0;  // prevent isolated quantum
      } else {
        any_tabs = true;
      }
    }
  }
  if (gapmap_debug && any_tabs)
    tprintf("Table found\n");
}