void BLOCK::compress() { // squash it up #define ROW_SPACING 5 ROW_IT row_it(&rows); ROW *row; ICOORD row_spacing (0, ROW_SPACING); ICOORDELT_IT icoordelt_it; sort_rows(); box = TBOX (box.topleft (), box.topleft ()); box.move_bottom_edge (ROW_SPACING); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row->move (box.botleft () - row_spacing - row->bounding_box ().topleft ()); box += row->bounding_box (); } leftside.clear (); icoordelt_it.set_to_list (&leftside); icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.bottom ())); icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.top ())); rightside.clear (); icoordelt_it.set_to_list (&rightside); icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.bottom ())); icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.top ())); }
// This method resolves the cc bbox to a particular row and returns the row's // xheight. int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) { if (!segmentation_block_list_) { return global_xheight_; } // Compute the box coordinates in Tesseract's coordinate system. TBOX bbox(cc_bbox->x, pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1, cc_bbox->x + cc_bbox->w, pixGetHeight(orig_pix_) - cc_bbox->y - 1); // Iterate over all blocks. BLOCK_IT block_it(segmentation_block_list_); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { BLOCK* block = block_it.data(); // Iterate over all rows in the block. ROW_IT row_it(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); if (!row->bounding_box().major_overlap(bbox)) { continue; } // Row could be skewed, warped, etc. Use the position of the box to // determine the baseline position of the row for that x-coordinate. // Create a square TBOX whose baseline's mid-point lies at this point // and side is row's xheight. Take the overlap of this box with the input // box and check if it is a 'major overlap'. If so, this box lies in this // row. In that case, return the xheight for this row. float box_middle = 0.5 * (bbox.left() + bbox.right()); int baseline = static_cast<int>(row->base_line(box_middle) + 0.5); TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2, static_cast<int>(baseline + row->x_height())); // Compute overlap. If it is is a major overlap, this is the right row. if (bbox.major_overlap(test_box)) { return row->x_height(); } } } // No row found for this bbox. return kUnspecifiedXheight; }
/** * Returns the baseline of the current object at the given level. * The baseline is the line that passes through (x1, y1) and (x2, y2). * WARNING: with vertical text, baselines may be vertical! */ bool PageIterator::Baseline(PageIteratorLevel level, int* x1, int* y1, int* x2, int* y2) const { if (it_->word() == NULL) return false; // Already at the end! ROW* row = it_->row()->row; WERD* word = it_->word()->word; TBOX box = (level == RIL_WORD || level == RIL_SYMBOL) ? word->bounding_box() : row->bounding_box(); int left = box.left(); ICOORD startpt(left, static_cast<inT16>(row->base_line(left) + 0.5)); int right = box.right(); ICOORD endpt(right, static_cast<inT16>(row->base_line(right) + 0.5)); // Rotate to image coordinates and convert to global image coords. startpt.rotate(it_->block()->block->re_rotation()); endpt.rotate(it_->block()->block->re_rotation()); *x1 = startpt.x() / scale_ + rect_left_; *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_; *x2 = endpt.x() / scale_ + rect_left_; *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_; return true; }
// Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } }
// Finds the single row whose blobs the given box really overlaps.
//
// A row qualifies only when the box has a major overlap with an outline of
// one of the row's blobs; overlapping just the row's bounding box is not
// enough. The block, row, word and blob bounding boxes are used purely as
// cheap pre-filters on the way down to the outlines.
//
// block_list:        the real blocks to search.
// box:               the box read from the boxfile.
// block_id:          reset to 0 and incremented once per block visited, so
//                    after a complete search it holds the number of blocks
//                    in the list rather than the block of the chosen row.
// row_id_to_process: set to the 1-based position, within its block, of the
//                    matched row; not written when nothing matches.
//
// Returns the matched row, NULL if no outline matched, or NULL as soon as
// outlines in two different rows are matched (the error case).
ROW *find_row_of_box(BLOCK_LIST *block_list,  // real blocks
                     BOX box,                 // from boxfile
                     INT16 &block_id,
                     INT16 &row_id_to_process) {
  ROW *chosen_row = NULL;

  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;
  PBLOB_IT blob_it;
  OUTLINE_IT outline_it;

  block_id = 0;
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    block_id++;
    INT16 row_id = 0;
    BLOCK *block = block_it.data();
    if (!block->bounding_box().overlap(box))
      continue;

    row_it.set_to_list(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      row_id++;
      ROW *row = row_it.data();
      if (!row->bounding_box().overlap(box))
        continue;

      word_it.set_to_list(row->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list();
           word_it.forward()) {
        WERD *word = word_it.data();
        BOOL8 polyg = word->flag(W_POLYGON);
        if (!word->bounding_box().overlap(box))
          continue;

        blob_it.set_to_list(word->gblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          PBLOB *blob = blob_it.data();
          if (!gblob_bounding_box(blob, polyg).overlap(box))
            continue;

          outline_it.set_to_list(gblob_out_list(blob, polyg));
          for (outline_it.mark_cycle_pt(); !outline_it.cycled_list();
               outline_it.forward()) {
            OUTLINE *outline = outline_it.data();
            if (!goutline_bounding_box(outline, polyg).major_overlap(box))
              continue;

            // RETURN ERROR: the box overlaps blobs in more than one row.
            if (chosen_row != NULL && chosen_row != row)
              return NULL;

            chosen_row = row;
            row_id_to_process = row_id;
          }
        }
      }
    }
  }
  return chosen_row;
}
// Compute the distance from the left and right ends of each row to the // left and right edges of the block's polyblock. Illustration: // ____________________________ _______________________ // | Howdy neighbor! | |rectangular blocks look| // | This text is written to| |more like stacked pizza| // |illustrate how useful poly- |boxes. | // |blobs are in ----------- ------ The polyblob| // |dealing with| _________ |for a BLOCK rec-| // |harder layout| /===========\ |ords the possibly| // |issues. | | _ _ | |skewed pseudo-| // | You see this| | |_| \|_| | |rectangular | // |text is flowed| | } | |boundary that| // |around a mid-| \ ____ | |forms the ideal-| // |cloumn portrait._____ \ / __|ized text margin| // | Polyblobs exist| \ / |from which we should| // |to account for insets| | | |measure paragraph| // |which make otherwise| ----- |indentation. | // ----------------------- ---------------------- // // If we identify a drop-cap, we measure the left margin for the lines // below the first line relative to one space past the drop cap. The // first line's margin and those past the drop cap area are measured // relative to the enclosing polyblock. // // TODO(rays): Before this will work well, we'll need to adjust the // polyblob tighter around the text near images, as in: // UNLV_AUTO:mag.3G0 page 2 // UNLV_AUTO:mag.3G4 page 16 void BLOCK::compute_row_margins() { if (row_list()->empty() || row_list()->singleton()) { return; } // If Layout analysis was not called, default to this. POLY_BLOCK rect_block(bounding_box(), PT_FLOWING_TEXT); POLY_BLOCK *pblock = &rect_block; if (poly_block() != NULL) { pblock = poly_block(); } // Step One: Determine if there is a drop-cap. // TODO(eger): Fix up drop cap code for RTL languages. ROW_IT r_it(row_list()); ROW *first_row = r_it.data(); ROW *second_row = r_it.data_relative(1); // initialize the bottom of a fictitious drop cap far above the first line. 
int drop_cap_bottom = first_row->bounding_box().top() + first_row->bounding_box().height(); int drop_cap_right = first_row->bounding_box().left(); int mid_second_line = second_row->bounding_box().top() - second_row->bounding_box().height() / 2; WERD_IT werd_it(r_it.data()->word_list()); // words of line one if (!werd_it.empty()) { C_BLOB_IT cblob_it(werd_it.data()->cblob_list()); for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { TBOX bbox = cblob_it.data()->bounding_box(); if (bbox.bottom() <= mid_second_line) { // we found a real drop cap first_row->set_has_drop_cap(true); if (drop_cap_bottom > bbox.bottom()) drop_cap_bottom = bbox.bottom(); if (drop_cap_right < bbox.right()) drop_cap_right = bbox.right(); } } } // Step Two: Calculate the margin from the text of each row to the block // (or drop-cap) boundaries. PB_LINE_IT lines(pblock); r_it.set_to_list(row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW *row = r_it.data(); TBOX row_box = row->bounding_box(); int left_y = row->base_line(row_box.left()) + row->x_height(); int left_margin; ICOORDELT_LIST *segments = lines.get_line(left_y); LeftMargin(segments, row_box.left(), &left_margin); delete segments; if (row_box.top() >= drop_cap_bottom) { int drop_cap_distance = row_box.left() - row->space() - drop_cap_right; if (drop_cap_distance < 0) drop_cap_distance = 0; if (drop_cap_distance < left_margin) left_margin = drop_cap_distance; } int right_y = row->base_line(row_box.right()) + row->x_height(); int right_margin; segments = lines.get_line(right_y); RightMargin(segments, row_box.right(), &right_margin); delete segments; row->set_lmargin(left_margin); row->set_rmargin(right_margin); } }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. 
if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }