示例#1
0
void BLOCK::compress() {  // squash it up
  #define           ROW_SPACING 5

  ROW_IT row_it(&rows);
  ROW *row;
  ICOORD row_spacing (0, ROW_SPACING);

  ICOORDELT_IT icoordelt_it;

  sort_rows();

  box = TBOX (box.topleft (), box.topleft ());
  box.move_bottom_edge (ROW_SPACING);
  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
    row = row_it.data ();
    row->move (box.botleft () - row_spacing -
      row->bounding_box ().topleft ());
    box += row->bounding_box ();
  }

  leftside.clear ();
  icoordelt_it.set_to_list (&leftside);
  icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.bottom ()));
  icoordelt_it.add_to_end (new ICOORDELT (box.left (), box.top ()));
  rightside.clear ();
  icoordelt_it.set_to_list (&rightside);
  icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.bottom ()));
  icoordelt_it.add_to_end (new ICOORDELT (box.right (), box.top ()));
}
示例#2
0
void PrintSegmentationStats(BLOCK_LIST* block_list) {
  int num_blocks = 0;
  int num_rows = 0;
  int num_words = 0;
  int num_blobs = 0;
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    ++num_blocks;
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ++num_rows;
      ROW* row = row_it.data();
      // Iterate over all werds in the row.
      WERD_IT werd_it(row->word_list());
      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
        WERD* werd = werd_it.data();
        ++num_words;
        num_blobs += werd->cblob_list()->length();
      }
    }
  }
  tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n",
          num_blocks, num_rows, num_words, num_blobs);
}
示例#3
0
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
  TO_ROW_IT to_row_it(rows);
  TO_ROW* row = to_row_it.data();
  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
  // to create the word.
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(row->blob_list());
  for (;!box_it.empty(); box_it.forward()) {
    BLOBNBOX* bblob= box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL) {
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    } else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
      delete bblob;
    }
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
                          static_cast<inT16>(row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word_it.add_after_then_move(word);
  ROW_IT row_it(real_rows);
  row_it.add_after_then_move(real_row);
}
示例#4
0
//yangjing01 modified : 
bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows)
{
  TO_ROW_IT to_row_it(rows);
  ROW_IT row_it(real_rows);
  //to_real_row is the real row information of single row or single char mode
  TO_ROW* real_to_row = NULL;
  float row_max_height = 0.0;
  for (to_row_it.mark_cycle_pt();
    !to_row_it.cycled_list(); to_row_it.forward()){
    TO_ROW* row = to_row_it.data();
    float row_min_y = row->min_y();
    float row_max_y = row->max_y();
    float row_height = abs(row_max_y - row_min_y);
    if (real_to_row == NULL
      || row_height > row_max_height
      || fabs(row_height - row_max_height) < 1.0f){
      row_max_height = row_height;
      real_to_row = row;
    }
  }

  if (real_to_row == NULL){
    return false;
  }

  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(real_to_row->blob_list());
  for (; !box_it.empty(); box_it.forward()){
    BLOBNBOX* bblob = box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL){
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    }
    else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
    }
    delete bblob;
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size),
    static_cast<inT16>(real_to_row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word->set_flag(W_DONT_CHOP, one_blob);
  word_it.add_after_then_move(word);
  row_it.add_after_then_move(real_row);

  return true;
}
示例#5
0
BaselineBlock::BaselineBlock(int debug_level, bool non_text, TO_BLOCK* block)
  : block_(block), debug_level_(debug_level), non_text_block_(non_text),
    good_skew_angle_(false), skew_angle_(0.0),
    line_spacing_(block->line_spacing), line_offset_(0.0), model_error_(0.0) {
  TO_ROW_IT row_it(block_->get_rows());
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    // Sort the blobs on the rows.
    row_it.data()->blob_list()->sort(blob_x_order);
    rows_.push_back(new BaselineRow(block->line_spacing, row_it.data()));
  }
}
示例#6
0
// Helper computes median xheight in the image.
static double MedianXHeight(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  STATS xheights(0, block_it.data()->bounding_box().height());
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
    }
  }
  return xheights.median();
}
示例#7
0
static void clear_any_old_text(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt();
       !block_it.cycled_list(); block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
      for (word_it.mark_cycle_pt();
           !word_it.cycled_list(); word_it.forward()) {
        word_it.data()->set_text("");
      }
    }
  }
}
示例#8
0
// Sets the parameters in TO_BLOCK that are needed by subsequent processes.
void BaselineBlock::SetupBlockParameters() const {
  if (line_spacing_ > 0.0) {
    // Where was block_line_spacing set before?
    float min_spacing = MIN(block_->line_spacing, line_spacing_);
    if (min_spacing < block_->line_size)
      block_->line_size = min_spacing;
    block_->line_spacing = line_spacing_;
    block_->baseline_offset = line_offset_;
    block_->max_blob_size = line_spacing_ * kMaxBlobSizeMultiple;
  }
  // Setup the parameters on all the rows.
  TO_ROW_IT row_it(block_->get_rows());
  for (int r = 0; r < rows_.size(); ++r, row_it.forward()) {
    BaselineRow* row = rows_[r];
    TO_ROW* to_row = row_it.data();
    row->SetupOldLineParameters(to_row);
  }
}
// This method resolves the cc bbox to a particular row and returns the row's
// xheight.
int ShiroRekhaSplitter::GetXheightForCC(Box* cc_bbox) {
  if (!segmentation_block_list_) {
    return global_xheight_;
  }
  // Compute the box coordinates in Tesseract's coordinate system.
  TBOX bbox(cc_bbox->x,
            pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
            cc_bbox->x + cc_bbox->w,
            pixGetHeight(orig_pix_) - cc_bbox->y - 1);
  // Iterate over all blocks.
  BLOCK_IT block_it(segmentation_block_list_);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    // Iterate over all rows in the block.
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      if (!row->bounding_box().major_overlap(bbox)) {
        continue;
      }
      // Row could be skewed, warped, etc. Use the position of the box to
      // determine the baseline position of the row for that x-coordinate.
      // Create a square TBOX whose baseline's mid-point lies at this point
      // and side is row's xheight. Take the overlap of this box with the input
      // box and check if it is a 'major overlap'. If so, this box lies in this
      // row. In that case, return the xheight for this row.
      float box_middle = 0.5 * (bbox.left() + bbox.right());
      int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
      TBOX test_box(box_middle - row->x_height() / 2,
                    baseline,
                    box_middle + row->x_height() / 2,
                    static_cast<int>(baseline + row->x_height()));
      // Compute overlap. If it is is a major overlap, this is the right row.
      if (bbox.major_overlap(test_box)) {
        return row->x_height();
      }
    }
  }
  // No row found for this bbox.
  return kUnspecifiedXheight;
}
示例#10
0
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
                                  C_BLOB_LIST* new_blobs,
                                  C_BLOB_LIST* not_found_blobs) {
  // Now iterate over all the blobs in the segmentation_block_list_, and just
  // replace the corresponding c-blobs inside the werds.
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    if (block->poly_block() != NULL && !block->poly_block()->IsText())
      continue;  // Don't touch non-text blocks.
    // Iterate over all rows in the block.
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      // Iterate over all werds in the row.
      WERD_IT werd_it(row->word_list());
      WERD_LIST new_words;
      WERD_IT new_words_it(&new_words);
      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
        WERD* werd = werd_it.extract();
        WERD* new_werd = werd->ConstructWerdWithNewBlobs(new_blobs,
                                                         not_found_blobs);
        if (new_werd) {
          // Insert this new werd into the actual row's werd-list. Remove the
          // existing one.
          new_words_it.add_after_then_move(new_werd);
          delete werd;
        } else {
          // Reinsert the older word back, for lack of better options.
          // This is critical since dropping the words messes up segmentation:
          // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on.
          new_words_it.add_after_then_move(werd);
        }
      }
      // Get rid of the old word list & replace it with the new one.
      row->word_list()->clear();
      werd_it.move_to_first();
      werd_it.add_list_after(&new_words);
    }
  }
}
示例#11
0
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks,
                                  C_BLOB_LIST* output_blob_list) {
  C_BLOB_IT return_list_it(output_blob_list);
  BLOCK_IT block_it(blocks);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      // Iterate over all werds in the row.
      WERD_IT werd_it(row->word_list());
      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
        WERD* werd = werd_it.data();
        return_list_it.move_to_last();
        return_list_it.add_list_after(werd->cblob_list());
        return_list_it.move_to_last();
        return_list_it.add_list_after(werd->rej_cblob_list());
      }
    }
  }
}
示例#12
0
// Fixes the block so it obeys all the rules:
// Must have at least one ROW.
// Must have at least one WERD.
// WERDs contain a fake blob.
void Textord::cleanup_nontext_block(BLOCK* block) {
  // Non-text blocks must contain at least one row.
  ROW_IT row_it(block->row_list());
  if (row_it.empty()) {
    const TBOX& box = block->pdblk.bounding_box();
    float height = box.height();
    int32_t xstarts[2] = {box.left(), box.right()};
    double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
    ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
                       height / 4.0f, 0, 1);
    row_it.add_after_then_move(row);
  }
  // Each row must contain at least one word.
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    ROW* row = row_it.data();
    WERD_IT w_it(row->word_list());
    if (w_it.empty()) {
      // Make a fake blob to put in the word.
      TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
                                                : row->bounding_box();
      C_BLOB* blob = C_BLOB::FakeBlob(box);
      C_BLOB_LIST blobs;
      C_BLOB_IT blob_it(&blobs);
      blob_it.add_after_then_move(blob);
      WERD* word = new WERD(&blobs, 0, nullptr);
      w_it.add_after_then_move(word);
    }
    // Each word must contain a fake blob.
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD* word = w_it.data();
      // Just assert that this is true, as it would be useful to find
      // out why it isn't.
      ASSERT_HOST(!word->cblob_list()->empty());
    }
    row->recalc_bounding_box();
  }
}
示例#13
0
void BLOCK::sort_rows() {  // order on "top"
  ROW_IT row_it(&rows);

  row_it.sort (decreasing_top_order);
}
示例#14
0
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
// TransferDiacriticsToWords to copy the diacritic blobs to the most
// appropriate words in the group of blocks. Source blobs are not touched.
void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
                                              BLOCK_LIST* blocks) {
  // Angle difference larger than this is too much to consider equal.
  // They should only be in multiples of M_PI/2 anyway.
  const double kMaxAngleDiff = 0.01;  // About 0.6 degrees.
  PointerVector<BlockGroup> groups;
  BLOCK_IT bk_it(blocks);
  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
    BLOCK* block = bk_it.data();
    if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
      continue;
    }
    // Linear search of the groups to find a matching rotation.
    float block_angle = block->re_rotation().angle();
    int best_g = 0;
    float best_angle_diff = MAX_FLOAT32;
    for (int g = 0; g < groups.size(); ++g) {
      double angle_diff = fabs(block_angle - groups[g]->angle);
      if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
      if (angle_diff < best_angle_diff) {
        best_angle_diff = angle_diff;
        best_g = g;
      }
    }
    if (best_angle_diff > kMaxAngleDiff) {
      groups.push_back(new BlockGroup(block));
    } else {
      groups[best_g]->blocks.push_back(block);
      groups[best_g]->bounding_box += block->pdblk.bounding_box();
      float x_height = block->x_height();
      if (x_height < groups[best_g]->min_xheight)
        groups[best_g]->min_xheight = x_height;
    }
  }
  // Now process each group of blocks.
  PointerVector<WordWithBox> word_ptrs;
  for (int g = 0; g < groups.size(); ++g) {
    const BlockGroup* group = groups[g];
    if (group->bounding_box.null_box()) continue;
    WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
                       group->bounding_box.topright());
    for (int b = 0; b < group->blocks.size(); ++b) {
      ROW_IT row_it(group->blocks[b]->row_list());
      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
        ROW* row = row_it.data();
        // Put the words of the row into the grid.
        WERD_IT w_it(row->word_list());
        for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
          WERD* word = w_it.data();
          WordWithBox* box_word = new WordWithBox(word);
          word_grid.InsertBBox(true, true, box_word);
          // Save the pointer where it will be auto-deleted.
          word_ptrs.push_back(box_word);
        }
      }
    }
    FCOORD rotation = group->rotation;
    // Make it a forward rotation that will transform blob coords to block.
    rotation.set_y(-rotation.y());
    TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
  }
}
示例#15
0
GAPMAP::GAPMAP(                 //Constructor
               TO_BLOCK *block  //block
              ) {
  TO_ROW *row;                   //current row
  BLOBNBOX_IT blob_it;           //iterator
  TBOX blob_box;
  TBOX prev_blob_box;
  int16_t gap_width;
  int16_t start_of_row;
  int16_t end_of_row;
  STATS xht_stats (0, 128);
  int16_t min_quantum;
  int16_t max_quantum;
  int16_t i;

  /*
    Find left and right extremes and bucket size
  */
  map = nullptr;
  min_left = INT16_MAX;
  max_right = -INT16_MAX;
  total_rows = 0;
  any_tabs = false;

  // row iterator
  TO_ROW_IT row_it(block->get_rows());
  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
    row = row_it.data ();
    if (!row->blob_list ()->empty ()) {
      total_rows++;
      xht_stats.add (static_cast<int16_t>(floor (row->xheight + 0.5)), 1);
      blob_it.set_to_list (row->blob_list ());
      start_of_row = blob_it.data ()->bounding_box ().left ();
      end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
      if (min_left > start_of_row)
        min_left = start_of_row;
      if (max_right < end_of_row)
        max_right = end_of_row;
    }
  }
  if ((total_rows < 3) || (min_left >= max_right)) {
    bucket_size = 0;
    map_max = 0;
    total_rows = 0;
    min_left = max_right = 0;
    return;
  }
  bucket_size = static_cast<int16_t>(floor (xht_stats.median () + 0.5)) / 2;
  map_max = (max_right - min_left) / bucket_size;
  map = new int16_t[map_max + 1];
  for (i = 0; i <= map_max; i++)
    map[i] = 0;

  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
    row = row_it.data ();
    if (!row->blob_list ()->empty ()) {
      blob_it.set_to_list (row->blob_list ());
      blob_it.mark_cycle_pt ();
      blob_box = box_next (&blob_it);
      prev_blob_box = blob_box;
      if (gapmap_use_ends) {
        /* Leading space */
        gap_width = blob_box.left () - min_left;
        if ((gap_width > gapmap_big_gaps * row->xheight)
        && gap_width > 2) {
          max_quantum = (blob_box.left () - min_left) / bucket_size;
          if (max_quantum > map_max) max_quantum = map_max;
            for (i = 0; i <= max_quantum; i++)
            map[i]++;
        }
      }
      while (!blob_it.cycled_list ()) {
        blob_box = box_next (&blob_it);
        gap_width = blob_box.left () - prev_blob_box.right ();
        if ((gap_width > gapmap_big_gaps * row->xheight)
        && gap_width > 2) {
          min_quantum =
            (prev_blob_box.right () - min_left) / bucket_size;
          max_quantum = (blob_box.left () - min_left) / bucket_size;
          if (max_quantum > map_max) max_quantum = map_max;
          for (i = min_quantum; i <= max_quantum; i++)
            map[i]++;
        }
        prev_blob_box = blob_box;
      }
      if (gapmap_use_ends) {
        /* Trailing space */
        gap_width = max_right - prev_blob_box.right ();
        if ((gap_width > gapmap_big_gaps * row->xheight)
        && gap_width > 2) {
          min_quantum =
            (prev_blob_box.right () - min_left) / bucket_size;
          if (min_quantum < 0) min_quantum = 0;
          for (i = min_quantum; i <= map_max; i++)
            map[i]++;
        }
      }
    }
  }
  for (i = 0; i <= map_max; i++) {
    if (map[i] > total_rows / 2) {
      if (gapmap_no_isolated_quanta &&
        (((i == 0) &&
        (map[i + 1] <= total_rows / 2)) ||
        ((i == map_max) &&
        (map[i - 1] <= total_rows / 2)) ||
        ((i > 0) &&
        (i < map_max) &&
        (map[i - 1] <= total_rows / 2) &&
      (map[i + 1] <= total_rows / 2)))) {
        map[i] = 0;              //prevent isolated quantum
      }
      else
        any_tabs = true;
    }
  }
  if (gapmap_debug && any_tabs)
    tprintf ("Table found\n");
}