Ejemplo n.º 1
0
// Prints a summary of the segmentation reachable from block_list: the number
// of blocks, the number of rows in those blocks, the number of words in those
// rows, and the sum of the lengths of every word's cblob_list().
// The lists are only read; output goes through tprintf.
void PrintSegmentationStats(BLOCK_LIST* block_list) {
  int block_total = 0;
  int row_total = 0;
  int word_total = 0;
  int blob_total = 0;

  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    ++block_total;
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      ++row_total;
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        ++word_total;
        // Only the word's cblob_list() is counted here.
        blob_total += words.data()->cblob_list()->length();
      }
    }
  }

  tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n",
          block_total, row_total, word_total, blob_total);
}
Ejemplo n.º 2
0
// Rebuilds the word list of every row in every text block of block_list.
// Each existing word is asked to build a replacement through
// WERD::ConstructWerdWithNewBlobs(new_blobs, not_found_blobs). When a
// replacement is returned it takes the old word's place and the old word is
// deleted; when NULL is returned the old word is kept as-is.
// Blocks that have a poly_block() reporting !IsText() are left untouched.
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
                                  C_BLOB_LIST* new_blobs,
                                  C_BLOB_LIST* not_found_blobs) {
  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    BLOCK* cur_block = blocks.data();
    const bool has_poly_block = cur_block->poly_block() != NULL;
    if (has_poly_block && !cur_block->poly_block()->IsText()) {
      continue;  // Non-text blocks are skipped entirely.
    }

    ROW_IT rows(cur_block->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      ROW* cur_row = rows.data();
      WERD_IT old_words(cur_row->word_list());
      // Words are staged in a scratch list while the row's own list is being
      // drained by extract(); the scratch list is handed back to the row once
      // the loop is done.
      WERD_LIST rebuilt_words;
      WERD_IT rebuilt_it(&rebuilt_words);
      for (old_words.mark_cycle_pt(); !old_words.cycled_list();
           old_words.forward()) {
        WERD* old_word = old_words.extract();
        WERD* replacement =
            old_word->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs);
        if (replacement == NULL) {
          // No replacement could be built: keep the original word rather than
          // dropping it. Dropping words messes up segmentation, e.g. the 1st
          // word in the row might otherwise have W_FUZZY_NON turned on.
          rebuilt_it.add_after_then_move(old_word);
          continue;
        }
        // The replacement goes into the staged list and the word it
        // supersedes is released.
        rebuilt_it.add_after_then_move(replacement);
        delete old_word;
      }

      // Swap the staged words in as the row's word list.
      cur_row->word_list()->clear();
      old_words.move_to_first();
      old_words.add_list_after(&rebuilt_words);
    }
  }
}
Ejemplo n.º 3
0
// Gathers the blobs of every word in the segmentation into output_blob_list.
// For each word, in block/row/word iteration order, the word's cblob_list()
// is added at the end of the output list, followed by its rej_cblob_list().
// NOTE(review): add_list_after() is expected to transfer the elements out of
// the word's lists rather than copy them - confirm against the list iterator
// implementation before relying on the words still holding blobs afterwards.
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks,
                                  C_BLOB_LIST* output_blob_list) {
  C_BLOB_IT out_it(output_blob_list);
  BLOCK_IT b_it(blocks);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    ROW_IT r_it(b_it.data()->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      WERD_IT w_it(r_it.data()->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();

        // Regular blobs first. The output iterator is re-positioned on the
        // last element before each append so new blobs always land at the end.
        out_it.move_to_last();
        out_it.add_list_after(word->cblob_list());

        // Then the word's rejected blobs.
        out_it.move_to_last();
        out_it.add_list_after(word->rej_cblob_list());
      }
    }
  }
}
Ejemplo n.º 4
0
// Compute the distance from the left and right ends of each row to the
// left and right edges of the block's polyblock.  Illustration:
//  ____________________________   _______________________
//  |  Howdy neighbor!         |  |rectangular blocks look|
//  |  This text is  written to|  |more like stacked pizza|
//  |illustrate how useful poly-  |boxes.                 |
//  |blobs  are   in -----------  ------   The    polyblob|
//  |dealing    with|     _________     |for a BLOCK  rec-|
//  |harder   layout|   /===========\   |ords the possibly|
//  |issues.        |    |  _    _  |   |skewed    pseudo-|
//  |  You  see this|    | |_| \|_| |   |rectangular      |
//  |text is  flowed|    |      }   |   |boundary     that|
//  |around  a  mid-|     \   ____  |   |forms the  ideal-|
//  |column portrait._____ \       /  __|ized  text margin|
//  |  Polyblobs     exist| \    /   |from which we should|
//  |to account for insets|  |   |   |measure    paragraph|
//  |which make  otherwise|  -----   |indentation.        |
//  -----------------------          ----------------------
//
// If we identify a drop-cap, we measure the left margin for the lines
// below the first line relative to one space past the drop cap.  The
// first line's margin and those past the drop cap area are measured
// relative to the enclosing polyblock.
//
// TODO(rays): Before this will work well, we'll need to adjust the
//             polyblob tighter around the text near images, as in:
//             UNLV_AUTO:mag.3G0  page 2
//             UNLV_AUTO:mag.3G4  page 16
void BLOCK::compute_row_margins() {
  if (row_list()->empty() || row_list()->singleton()) {
    return;
  }

  // If Layout analysis was not called, default to this.
  POLY_BLOCK rect_block(bounding_box(), PT_FLOWING_TEXT);
  POLY_BLOCK *pblock = &rect_block;
  if (poly_block() != NULL) {
    pblock = poly_block();
  }

  // Step One: Determine if there is a drop-cap.
  //           TODO(eger): Fix up drop cap code for RTL languages.
  ROW_IT r_it(row_list());
  ROW *first_row = r_it.data();
  ROW *second_row = r_it.data_relative(1);

  // initialize the bottom of a fictitious drop cap far above the first line.
  int drop_cap_bottom = first_row->bounding_box().top() +
                        first_row->bounding_box().height();
  int drop_cap_right = first_row->bounding_box().left();
  int mid_second_line = second_row->bounding_box().top() -
                        second_row->bounding_box().height() / 2;
  WERD_IT werd_it(r_it.data()->word_list());  // words of line one
  if (!werd_it.empty()) {
    C_BLOB_IT cblob_it(werd_it.data()->cblob_list());
    for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list();
         cblob_it.forward()) {
      TBOX bbox = cblob_it.data()->bounding_box();
      if (bbox.bottom() <= mid_second_line) {
        // we found a real drop cap
        first_row->set_has_drop_cap(true);
        if (drop_cap_bottom >  bbox.bottom())
          drop_cap_bottom = bbox.bottom();
        if (drop_cap_right < bbox.right())
          drop_cap_right = bbox.right();
      }
    }
  }

  // Step Two: Calculate the margin from the text of each row to the block
  //           (or drop-cap) boundaries.
  PB_LINE_IT lines(pblock);
  r_it.set_to_list(row_list());
  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
    ROW *row = r_it.data();
    TBOX row_box = row->bounding_box();
    int left_y = row->base_line(row_box.left()) + row->x_height();
    int left_margin;
    ICOORDELT_LIST *segments = lines.get_line(left_y);
    LeftMargin(segments, row_box.left(), &left_margin);
    delete segments;

    if (row_box.top() >= drop_cap_bottom) {
      int drop_cap_distance = row_box.left() - row->space() - drop_cap_right;
      if (drop_cap_distance < 0)
        drop_cap_distance = 0;
      if (drop_cap_distance < left_margin)
        left_margin = drop_cap_distance;
    }

    int right_y = row->base_line(row_box.right()) + row->x_height();
    int right_margin;
    segments = lines.get_line(right_y);
    RightMargin(segments, row_box.right(), &right_margin);
    delete segments;
    row->set_lmargin(left_margin);
    row->set_rmargin(right_margin);
  }
}