Пример #1
0
void BLOCK::compress() {  // squash it up
  #define           ROW_SPACING 5

  ROW_IT row_it(&rows);
  ROW *row;
  ICOORD row_spacing (0, ROW_SPACING);

  ICOORDELT_IT icoordelt_it;

  sort_rows();

  box = TBOX (box.topleft (), box.topleft ());
  box.move_bottom_edge (ROW_SPACING);
  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
    row = row_it.data ();
    row->move (box.botleft () - row_spacing -
      row->bounding_box ().topleft ());
    box += row->bounding_box ();
  }

  leftside.clear ();
  icoordelt_it.set_to_list (&leftside);
  icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.bottom ()));
  icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.top ()));
  rightside.clear ();
  icoordelt_it.set_to_list (&rightside);
  icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.bottom ()));
  icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.top ()));
}
// This method resolves the cc bbox to a particular row and returns the row's
// xheight.
int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
  if (!segmentation_block_list_) {
    return global_xheight_;
  }
  // Compute the box coordinates in Tesseract's coordinate system.
  TBOX bbox(cc_bbox->x,
            pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
            cc_bbox->x + cc_bbox->w,
            pixGetHeight(orig_pix_) - cc_bbox->y - 1);
  // Iterate over all blocks.
  BLOCK_IT block_it(segmentation_block_list_);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    // Iterate over all rows in the block.
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      if (!row->bounding_box().major_overlap(bbox)) {
        continue;
      }
      // Row could be skewed, warped, etc. Use the position of the box to
      // determine the baseline position of the row for that x-coordinate.
      // Create a square TBOX whose baseline's mid-point lies at this point
      // and side is row's xheight. Take the overlap of this box with the input
      // box and check if it is a 'major overlap'. If so, this box lies in this
      // row. In that case, return the xheight for this row.
      float box_middle = 0.5 * (bbox.left() + bbox.right());
      int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
      TBOX test_box(box_middle - row->x_height() / 2,
                    baseline,
                    box_middle + row->x_height() / 2,
                    static_cast<int>(baseline + row->x_height()));
      // Compute overlap. If it is is a major overlap, this is the right row.
      if (bbox.major_overlap(test_box)) {
        return row->x_height();
      }
    }
  }
  // No row found for this bbox.
  return kUnspecifiedXheight;
}
Пример #3
0
/**
 * Returns the baseline of the current object at the given level.
 * The baseline is the line that passes through (x1, y1) and (x2, y2).
 * WARNING: with vertical text, baselines may be vertical!
 */
bool PageIterator::Baseline(PageIteratorLevel level,
                            int* x1, int* y1, int* x2, int* y2) const {
  if (it_->word() == NULL) return false;  // Already at the end!
  ROW* row = it_->row()->row;
  WERD* word = it_->word()->word;
  TBOX box = (level == RIL_WORD || level == RIL_SYMBOL)
           ? word->bounding_box()
           : row->bounding_box();
  int left = box.left();
  ICOORD startpt(left, static_cast<inT16>(row->base_line(left) + 0.5));
  int right = box.right();
  ICOORD endpt(right, static_cast<inT16>(row->base_line(right) + 0.5));
  // Rotate to image coordinates and convert to global image coords.
  startpt.rotate(it_->block()->block->re_rotation());
  endpt.rotate(it_->block()->block->re_rotation());
  *x1 = startpt.x() / scale_ + rect_left_;
  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
  *x2 = endpt.x() / scale_ + rect_left_;
  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
  return true;
}
Пример #4
0
// Fixes the block so it obeys all the rules:
// Must have at least one ROW.
// Must have at least one WERD.
// WERDs contain a fake blob.
void Textord::cleanup_nontext_block(BLOCK* block) {
  // Non-text blocks must contain at least one row.
  ROW_IT row_it(block->row_list());
  if (row_it.empty()) {
    const TBOX& box = block->pdblk.bounding_box();
    float height = box.height();
    int32_t xstarts[2] = {box.left(), box.right()};
    double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
    ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
                       height / 4.0f, 0, 1);
    row_it.add_after_then_move(row);
  }
  // Each row must contain at least one word.
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    ROW* row = row_it.data();
    WERD_IT w_it(row->word_list());
    if (w_it.empty()) {
      // Make a fake blob to put in the word.
      TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
                                                : row->bounding_box();
      C_BLOB* blob = C_BLOB::FakeBlob(box);
      C_BLOB_LIST blobs;
      C_BLOB_IT blob_it(&blobs);
      blob_it.add_after_then_move(blob);
      WERD* word = new WERD(&blobs, 0, nullptr);
      w_it.add_after_then_move(word);
    }
    // Each word must contain a fake blob.
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD* word = w_it.data();
      // Just assert that this is true, as it would be useful to find
      // out why it isn't.
      ASSERT_HOST(!word->cblob_list()->empty());
    }
    row->recalc_bounding_box();
  }
}
Пример #5
0
ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     BOX box,                 //from boxfile
                     INT16 &block_id,
                     INT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  INT16 row_id;
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  block_id = 0;
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    block_id++;
    row_id = 0;
    block = block_it.data ();
    if (block->bounding_box ().overlap (box)) {
      row_it.set_to_list (block->row_list ());
      for (row_it.mark_cycle_pt ();
      !row_it.cycled_list (); row_it.forward ()) {
        row_id++;
        row = row_it.data ();
        if (row->bounding_box ().overlap (box)) {
          word_it.set_to_list (row->word_list ());
          for (word_it.mark_cycle_pt ();
          !word_it.cycled_list (); word_it.forward ()) {
            word = word_it.data ();
            polyg = word->flag (W_POLYGON);
            if (word->bounding_box ().overlap (box)) {
              blob_it.set_to_list (word->gblob_list ());
              for (blob_it.mark_cycle_pt ();
              !blob_it.cycled_list (); blob_it.forward ()) {
                blob = blob_it.data ();
                if (gblob_bounding_box (blob, polyg).
                overlap (box)) {
                  outline_it.
                    set_to_list (gblob_out_list
                    (blob, polyg));
                  for (outline_it.mark_cycle_pt ();
                    !outline_it.cycled_list ();
                  outline_it.forward ()) {
                    outline = outline_it.data ();
                    if (goutline_bounding_box
                    (outline, polyg).major_overlap (box)) {
                      if ((row_to_process == NULL) ||
                      (row_to_process == row)) {
                        row_to_process = row;
                        row_id_to_process = row_id;
                      }
                      else
                        /* RETURN ERROR Box overlaps blobs in more than one row  */
                        return NULL;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return row_to_process;
}
Пример #6
0
// Compute the distance from the left and right ends of each row to the
// left and right edges of the block's polyblock.  Illustration:
//  ____________________________   _______________________
//  |  Howdy neighbor!         |  |rectangular blocks look|
//  |  This text is  written to|  |more like stacked pizza|
//  |illustrate how useful poly-  |boxes.                 |
//  |blobs  are   in -----------  ------   The    polyblob|
//  |dealing    with|     _________     |for a BLOCK  rec-|
//  |harder   layout|   /===========\   |ords the possibly|
//  |issues.        |    |  _    _  |   |skewed    pseudo-|
//  |  You  see this|    | |_| \|_| |   |rectangular      |
//  |text is  flowed|    |      }   |   |boundary     that|
//  |around  a  mid-|     \   ____  |   |forms the  ideal-|
//  |cloumn portrait._____ \       /  __|ized  text margin|
//  |  Polyblobs     exist| \    /   |from which we should|
//  |to account for insets|  |   |   |measure    paragraph|
//  |which make  otherwise|  -----   |indentation.        |
//  -----------------------          ----------------------
//
// If we identify a drop-cap, we measure the left margin for the lines
// below the first line relative to one space past the drop cap.  The
// first line's margin and those past the drop cap area are measured
// relative to the enclosing polyblock.
//
// TODO(rays): Before this will work well, we'll need to adjust the
//             polyblob tighter around the text near images, as in:
//             UNLV_AUTO:mag.3G0  page 2
//             UNLV_AUTO:mag.3G4  page 16
void BLOCK::compute_row_margins() {
  if (row_list()->empty() || row_list()->singleton()) {
    return;
  }

  // If Layout analysis was not called, default to this.
  POLY_BLOCK rect_block(bounding_box(), PT_FLOWING_TEXT);
  POLY_BLOCK *pblock = &rect_block;
  if (poly_block() != NULL) {
    pblock = poly_block();
  }

  // Step One: Determine if there is a drop-cap.
  //           TODO(eger): Fix up drop cap code for RTL languages.
  ROW_IT r_it(row_list());
  ROW *first_row = r_it.data();
  ROW *second_row = r_it.data_relative(1);

  // initialize the bottom of a fictitious drop cap far above the first line.
  int drop_cap_bottom = first_row->bounding_box().top() +
                        first_row->bounding_box().height();
  int drop_cap_right = first_row->bounding_box().left();
  int mid_second_line = second_row->bounding_box().top() -
                        second_row->bounding_box().height() / 2;
  WERD_IT werd_it(r_it.data()->word_list());  // words of line one
  if (!werd_it.empty()) {
    C_BLOB_IT cblob_it(werd_it.data()->cblob_list());
    for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list();
         cblob_it.forward()) {
      TBOX bbox = cblob_it.data()->bounding_box();
      if (bbox.bottom() <= mid_second_line) {
        // we found a real drop cap
        first_row->set_has_drop_cap(true);
        if (drop_cap_bottom >  bbox.bottom())
          drop_cap_bottom = bbox.bottom();
        if (drop_cap_right < bbox.right())
          drop_cap_right = bbox.right();
      }
    }
  }

  // Step Two: Calculate the margin from the text of each row to the block
  //           (or drop-cap) boundaries.
  PB_LINE_IT lines(pblock);
  r_it.set_to_list(row_list());
  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
    ROW *row = r_it.data();
    TBOX row_box = row->bounding_box();
    int left_y = row->base_line(row_box.left()) + row->x_height();
    int left_margin;
    ICOORDELT_LIST *segments = lines.get_line(left_y);
    LeftMargin(segments, row_box.left(), &left_margin);
    delete segments;

    if (row_box.top() >= drop_cap_bottom) {
      int drop_cap_distance = row_box.left() - row->space() - drop_cap_right;
      if (drop_cap_distance < 0)
        drop_cap_distance = 0;
      if (drop_cap_distance < left_margin)
        left_margin = drop_cap_distance;
    }

    int right_y = row->base_line(row_box.right()) + row->x_height();
    int right_margin;
    segments = lines.get_line(right_y);
    RightMargin(segments, row_box.right(), &right_margin);
    delete segments;
    row->set_lmargin(left_margin);
    row->set_rmargin(right_margin);
  }
}
Пример #7
0
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD* new_word = NULL;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK* block = b_it.data();
    if (!box.major_overlap(block->bounding_box()))
      continue;
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW* row = r_it.data();
      if (!box.major_overlap(row->bounding_box()))
        continue;
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != NULL && word->text()[0] != '\0')
          continue;  // Ignore words that are already done.
        if (!box.major_overlap(word->bounding_box()))
          continue;
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
          double current_box_miss_metric = BoxMissMetric(blob_box, box);
          double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    current_box_miss_metric, next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric)
            continue;  // Blob is a better match for next box.
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
  return new_word != NULL;
}