Ejemplo n.º 1
0
// Builds one WERD out of all the blobs of a TO_ROW and appends a new ROW
// containing just that word to real_rows.
//   one_blob:  when true, every blob after the first is merged into the
//              first C_BLOB, so the resulting word holds a single blob.
//   rows:      source rows; only the row the iterator initially points at
//              is used.
//   real_rows: list that receives the newly created ROW.
// NOTE(review): an empty rows list, or a first blob flagged joined_to_prev,
// is not guarded against here - confirm callers guarantee neither happens.
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) {
  TO_ROW_IT to_row_it(rows);
  TO_ROW* row = to_row_it.data();
  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
  // to create the word.
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(row->blob_list());
  for (; !box_it.empty(); box_it.forward()) {
    BLOBNBOX* bblob = box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL) {
        // Merge this blob's outlines into the blob currently under cblob_it,
        // then dispose of the C_BLOB shell they came from.
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    } else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
    }
    // The BLOBNBOX was extracted from its list above, so it must be released
    // on every path (previously it leaked whenever the blob was merged).
    delete bblob;
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
                          static_cast<inT16>(row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  // The single word is both the first and the last word of its row.
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word_it.add_after_then_move(word);
  ROW_IT row_it(real_rows);
  row_it.add_after_then_move(real_row);
}
Ejemplo n.º 2
0
// Moves the word currently under rep_it into the list behind word_it.
// On return prev_chop_coord is the right edge of the moved word, rep_left is
// the left edge of the next repeated word (INT16_MAX when none remain) and
// blanks has been reset to 0. Returns the word that was moved.
WERD *add_repeated_word(                         //move repeated word
                        WERD_IT *rep_it,         //repeated words
                        int16_t &rep_left,         //left edge of word
                        int16_t &prev_chop_coord,  //previous word end
                        uint8_t &blanks,           //no of blanks
                        float pitch,             //char cell size
                        WERD_IT *word_it         //list of words
                       ) {
  // A gap between the previous chop point and this word is converted to a
  // whole number of pitch-sized blanks (rounded to nearest).
  if (rep_left > prev_chop_coord) {
    const int16_t gap_blanks = static_cast<uint8_t>(
        floor((rep_left - prev_chop_coord) / pitch + 0.5));
    blanks += gap_blanks;
  }

  WERD *moved = rep_it->extract();
  prev_chop_coord = moved->bounding_box().right();
  word_it->add_after_then_move(moved);
  moved->set_blanks(blanks);

  // Step to the next repeated word, if any, and publish its left edge.
  rep_it->forward();
  if (!rep_it->empty()) {
    rep_left = rep_it->data()->bounding_box().left();
  } else {
    rep_left = INT16_MAX;
  }
  blanks = 0;
  return moved;
}
Ejemplo n.º 3
0
// Counts the blocks, rows, words and c-blobs reachable from block_list and
// prints the four totals with tprintf.
void PrintSegmentationStats(BLOCK_LIST* block_list) {
  int block_total = 0;
  int row_total = 0;
  int word_total = 0;
  int blob_total = 0;

  BLOCK_IT blocks(block_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    block_total++;
    ROW_IT rows(blocks.data()->row_list());
    for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
      row_total++;
      // Every word of the row contributes one word and all of its c-blobs.
      WERD_IT words(rows.data()->word_list());
      for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
        word_total++;
        blob_total += words.data()->cblob_list()->length();
      }
    }
  }

  tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n",
          block_total, row_total, word_total, blob_total);
}
Ejemplo n.º 4
0
// Looks for the first word overlapping selection_box that has at least one
// blob overlapping it, deep-copies those blobs into a new WERD, inserts that
// word into page_res as a simple clone of the matched word, and returns a
// heap-allocated PAGE_RES_IT positioned on the inserted word.
// Returns NULL when no blob overlaps the selection.
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT pr_it(page_res);
  C_BLOB_LIST gathered;                 // copies of the selected blobs
  C_BLOB_IT gathered_it(&gathered);

  for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.forward()) {
    WERD* candidate = word_res->word;
    if (!candidate->bounding_box().overlap(selection_box))
      continue;

    // Collect a copy of each blob of this word that touches the selection.
    C_BLOB_IT src_it(candidate->cblob_list());
    for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
      C_BLOB* src_blob = src_it.data();
      if (src_blob->bounding_box().overlap(selection_box))
        gathered_it.add_after_then_move(C_BLOB::deep_copy(src_blob));
    }
    if (gathered.empty())
      continue;

    WERD* pseudo_word = new WERD(&gathered, 1, NULL);
    word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
    // Walk a fresh iterator forward until it sits on the inserted word.
    PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
    while (it->word() != word_res && it->word() != NULL)
      it->forward();
    ASSERT_HOST(it->word() == word_res);
    return it;
  }
  return NULL;
}
Ejemplo n.º 5
0
/// Prepares block_list for ApplyBoxes and wraps it in a new PAGE_RES.
/// Words without any c-blobs are deleted, the fuzzy-space flags are cleared
/// on all remaining words, and MaximallyChopWord is run on every word of the
/// resulting PAGE_RES.
/// @return the newly allocated PAGE_RES.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                     BLOCK_LIST *block_list) {
  PreenXHeights(block_list);

  // Pass 1: drop empty words and strip the fuzzy space markers.
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    ROW_IT row_it(block_it.data()->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      WERD_IT word_it(row_it.data()->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list();
           word_it.forward()) {
        WERD* cur = word_it.data();
        if (!cur->cblob_list()->empty()) {
          cur->set_flag(W_FUZZY_SP, false);
          cur->set_flag(W_FUZZY_NON, false);
          continue;
        }
        delete word_it.extract();
      }
    }
  }

  // Pass 2: build the PAGE_RES and chop each of its words.
  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    MaximallyChopWord(boxes, pr_it.block()->block,
                      pr_it.row()->row, word_res);
  }
  return page_res;
}
Ejemplo n.º 6
0
// Strips tiny outlines from every word of the row. The size threshold is
// textord_noise_hfract times the word's bounding-box height, rounded to the
// nearest integer. Blobs left with no outlines are deleted, and so are words
// left with no blobs.
void Textord::clean_small_noise_from_words(ROW *row) {
  WERD_IT w_it(row->word_list());
  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
    WERD* cur_word = w_it.data();
    int min_size = static_cast<int>(
        textord_noise_hfract * cur_word->bounding_box().height() + 0.5);

    C_BLOB_IT b_it(cur_word->cblob_list());
    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
      C_BLOB* cur_blob = b_it.data();
      C_OUTLINE_IT o_it(cur_blob->out_list());
      for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
        C_OUTLINE* cur_outline = o_it.data();
        cur_outline->RemoveSmallRecursive(min_size, &o_it);
      }
      if (cur_blob->out_list()->empty())
        delete b_it.extract();
    }

    // Words that still own blobs are kept untouched.
    if (!cur_word->cblob_list()->empty())
      continue;

    if (!w_it.at_last()) {
      // The word before the next one is about to vanish, so the next word
      // can no longer be a fuzzy non-space.
      WERD* following = w_it.data_relative(1);
      if (following->flag(W_FUZZY_NON))
        following->set_flag(W_FUZZY_NON, false);
    }
    delete w_it.extract();
  }
}
Ejemplo n.º 7
0
// yangjing01 modified:
// Variant of make_single_word that first picks one TO_ROW out of rows and
// then builds a single WERD from that row's blobs, appending a new ROW that
// holds the word to real_rows.
//   one_blob:  when true, all blobs after the first are merged into the
//              first C_BLOB and the word is flagged W_DONT_CHOP.
//   rows:      candidate rows; the tallest one is chosen, a later row
//              replacing the current choice also when its height is within
//              1.0 of it.
//   real_rows: list that receives the newly created ROW.
// Returns false when no row could be chosen (rows is empty), true otherwise.
bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows)
{
  TO_ROW_IT to_row_it(rows);
  ROW_IT row_it(real_rows);
  // real_to_row is the row actually used in single row or single char mode.
  TO_ROW* real_to_row = NULL;
  float row_max_height = 0.0;
  for (to_row_it.mark_cycle_pt();
    !to_row_it.cycled_list(); to_row_it.forward()){
    TO_ROW* row = to_row_it.data();
    float row_min_y = row->min_y();
    float row_max_y = row->max_y();
    // fabs, not abs: abs() may bind to the integer overload and silently
    // truncate the floating point height.
    float row_height = fabs(row_max_y - row_min_y);
    if (real_to_row == NULL
      || row_height > row_max_height
      || fabs(row_height - row_max_height) < 1.0f){
      row_max_height = row_height;
      real_to_row = row;
    }
  }

  if (real_to_row == NULL){
    return false;
  }

  // Move the blobs out of their BLOBNBOX wrappers into a C_BLOB_LIST ready
  // to create the word.
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it(&cblobs);
  BLOBNBOX_IT box_it(real_to_row->blob_list());
  for (; !box_it.empty(); box_it.forward()){
    BLOBNBOX* bblob = box_it.extract();
    if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
      if (bblob->cblob() != NULL){
        // Append this blob's outlines to the blob currently under cblob_it.
        C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
        cout_it.move_to_last();
        cout_it.add_list_after(bblob->cblob()->out_list());
        delete bblob->cblob();
      }
    }
    else {
      if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
    }
    // The wrapper was extracted from its list, so release it on every path.
    delete bblob;
  }
  // Convert the TO_ROW to a ROW.
  ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size),
    static_cast<inT16>(real_to_row->space_size));
  WERD_IT word_it(real_row->word_list());
  WERD* word = new WERD(&cblobs, 0, NULL);
  // The single word is both the first and the last word of its row.
  word->set_flag(W_BOL, TRUE);
  word->set_flag(W_EOL, TRUE);
  word->set_flag(W_DONT_CHOP, one_blob);
  word_it.add_after_then_move(word);
  row_it.add_after_then_move(real_row);

  return true;
}
Ejemplo n.º 8
0
// Consumes blobcount BLOBNBOXes from box_it and turns their blobs into a new
// WERD. A blob flagged joined_to_prev has its outlines appended to the
// previously added blob instead of starting a new one. The word is built from
// the polygonal blobs when any were collected, otherwise from the c-blobs.
// blanks is raised to at least 1. W_BOL is set when bol is true; W_EOL is set
// when box_it is at the first element after the blobs have been consumed.
WERD *make_real_word(BLOBNBOX_IT *box_it,  //iterator
                     inT32 blobcount,      //no of blobs to use
                     BOOL8 bol,            //start of line
                     uinT8 blanks          //no of blanks
                    ) {
  OUTLINE_IT out_it;             // outlines of a polygonal blob
  C_OUTLINE_IT cout_it;          // outlines of a c-blob
  PBLOB_LIST blobs;              // polygonal blobs of the word
  C_BLOB_LIST cblobs;            // c-blobs of the word
  PBLOB_IT blob_it = &blobs;
  C_BLOB_IT cblob_it = &cblobs;

  for (inT32 taken = 0; taken < blobcount; ++taken) {
    BLOBNBOX *bblob = box_it->extract();
    if (!bblob->joined_to_prev()) {
      // Start a new blob in whichever list matches the blob type.
      if (bblob->blob() != NULL)
        blob_it.add_after_then_move(bblob->blob());
      else if (bblob->cblob() != NULL)
        cblob_it.add_after_then_move(bblob->cblob());
    } else if (bblob->blob() != NULL) {
      // Fold the outlines into the previous polygonal blob.
      out_it.set_to_list(blob_it.data()->out_list());
      out_it.move_to_last();
      out_it.add_list_after(bblob->blob()->out_list());
      delete bblob->blob();
    } else if (bblob->cblob() != NULL) {
      // Fold the outlines into the previous c-blob.
      cout_it.set_to_list(cblob_it.data()->out_list());
      cout_it.move_to_last();
      cout_it.add_list_after(bblob->cblob()->out_list());
      delete bblob->cblob();
    }
    delete bblob;
    box_it->forward();          // on to the next BLOBNBOX
  }

  if (blanks < 1)
    blanks = 1;

  WERD *word = blob_it.empty() ? new WERD(&cblobs, blanks, NULL)
                               : new WERD(&blobs, blanks, NULL);

  if (bol)
    word->set_flag(W_BOL, TRUE);
  if (box_it->at_first())
    word->set_flag(W_EOL, TRUE);  // wrapped round: end of line

  return word;
}
Ejemplo n.º 9
0
// Draws every word of this row in the given window by calling the
// single-argument WERD::plot on each of them.
void ROW::plot(               //draw it
               ScrollView* window  //window to draw in
              ) {
  WERD_IT word_it = &words;      //walks the words of this ROW
  word_it.mark_cycle_pt();
  while (!word_it.cycled_list()) {
    word_it.data()->plot(window);  //in rainbow colours
    word_it.forward();
  }
}
Ejemplo n.º 10
0
// Draws every word of this row in the given window, passing the same colour
// to WERD::plot for each of them.
void ROW::plot(                //draw it
               ScrollView* window,  //window to draw in
               ScrollView::Color colour   //colour to draw in
              ) {
  WERD_IT word_it = &words;      //walks the words of this ROW
  word_it.mark_cycle_pt();
  while (!word_it.cycled_list()) {
    word_it.data()->plot(window, colour);  //all in one colour
    word_it.forward();
  }
}
Ejemplo n.º 11
0
/**
 * @name process_selected_words()
 *
 * Visits the words of page_res in iteration order and calls word_processor
 * on each word whose bounding box overlaps selection_box. Iteration stops
 * as soon as word_processor returns a false value.
 */
void Tesseract::process_selected_words(
    PAGE_RES* page_res, // blocks to check
    TBOX & selection_box,
    BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) {
  PAGE_RES_IT page_res_it(page_res);
  while (page_res_it.word() != NULL) {
    WERD* current = page_res_it.word()->word;
    const bool selected = current->bounding_box().overlap(selection_box);
    // The processor is only invoked for selected words; a false result
    // aborts the walk.
    if (selected && !(this->*word_processor)(&page_res_it))
      return;
    page_res_it.forward();
  }
}
Ejemplo n.º 12
0
// Walks every word of every row of every block and, for each word whose text
// label is exactly one character long and which has exactly one blob, makes
// a baseline-normalised copy of the word and passes its blob together with
// the label to tess_training_tester. Prints how many blobs were processed.
void apply_box_training(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;
  WERD copy_outword;             // scratch copy that gets denormalised
  PBLOB_IT blob_it;
  DENORM denorm;
  INT16 count = 0;
  char ch[2] = {'\0', '\0'};     // one-character label, NUL terminated

  tprintf ("Generating training data\n");
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
       block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      ROW *row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt (); !word_it.cycled_list ();
           word_it.forward ()) {
        WERD *word = word_it.data ();
        // Only words with a single char label and a single blob are usable.
        if (strlen (word->text ()) != 1 ||
            word->gblob_list ()->length () != 1)
          continue;

        WERD *bln_word = make_bln_copy (word, row, row->x_height (), &denorm);
        blob_it.set_to_list (bln_word->blob_list ());
        ch[0] = *word->text ();
        tess_training_tester (blob_it.data (),   //single blob
                              &denorm, TRUE,     //correct
                              ch,                //correct ASCII char
                              1,                 //ASCII length
                              NULL);
        // NOTE(review): the denormalised copy below is built but nothing in
        // this function reads it afterwards - confirm whether it is needed.
        copy_outword = *(bln_word);
        copy_outword.baseline_denormalise (&denorm);
        blob_it.set_to_list (copy_outword.blob_list ());
        ch[0] = *word->text ();
        delete bln_word;
        count++;
      }
    }
  }
  tprintf ("Generated training data for %d blobs\n", count);
}
Ejemplo n.º 13
0
// Consumes blobcount BLOBNBOXes from box_it and turns their c-blobs into a
// new WERD. A blob flagged joined_to_prev has its outlines appended to the
// previously added c-blob instead of starting a new one. blanks is raised to
// at least 1. W_BOL is set when bol is true; W_EOL is set when box_it is at
// the first element after the blobs have been consumed.
WERD *make_real_word(BLOBNBOX_IT *box_it,  //iterator
                     int32_t blobcount,      //no of blobs to use
                     bool bol,            //start of line
                     uint8_t blanks          //no of blanks
                    ) {
  C_OUTLINE_IT cout_it;          // outlines of the blob being extended
  C_BLOB_LIST cblobs;            // blobs collected for the word
  C_BLOB_IT cblob_it = &cblobs;

  for (int32_t taken = 0; taken < blobcount; ++taken) {
    BLOBNBOX *bblob = box_it->extract();
    if (!bblob->joined_to_prev()) {
      // A fresh blob simply joins the end of the collection.
      if (bblob->cblob() != nullptr)
        cblob_it.add_after_then_move(bblob->cblob());
    } else if (bblob->cblob() != nullptr) {
      // Fold the outlines into the previous c-blob and drop the empty shell.
      cout_it.set_to_list(cblob_it.data()->out_list());
      cout_it.move_to_last();
      cout_it.add_list_after(bblob->cblob()->out_list());
      delete bblob->cblob();
    }
    delete bblob;
    box_it->forward();          // on to the next BLOBNBOX
  }

  if (blanks < 1)
    blanks = 1;

  WERD *word = new WERD(&cblobs, blanks, nullptr);

  if (bol)
    word->set_flag(W_BOL, true);
  if (box_it->at_first())
    word->set_flag(W_EOL, true);  // wrapped round: end of line

  return word;
}
Ejemplo n.º 14
0
// For every word in every row of the text blocks of block_list, asks the
// word to construct a replacement from new_blobs (passing not_found_blobs
// along to that call). When a replacement is produced the old word is
// deleted; otherwise the old word is kept. Each row's word list is then
// rebuilt from the resulting words. Blocks with a poly_block that is not
// text are skipped.
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
                                  C_BLOB_LIST* new_blobs,
                                  C_BLOB_LIST* not_found_blobs) {
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
    BLOCK* block = block_it.data();
    const bool non_text =
        block->poly_block() != NULL && !block->poly_block()->IsText();
    if (non_text)
      continue;  // Leave non-text blocks alone.

    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      WERD_IT werd_it(row->word_list());
      WERD_LIST rebuilt_words;
      WERD_IT rebuilt_it(&rebuilt_words);
      for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) {
        WERD* old_werd = werd_it.extract();
        WERD* fresh_werd = old_werd->ConstructWerdWithNewBlobs(new_blobs,
                                                               not_found_blobs);
        if (!fresh_werd) {
          // No replacement: keep the old word, for lack of better options.
          // Dropping words would mess up segmentation, eg. the 1st word in
          // the row might otherwise have W_FUZZY_NON turned on.
          rebuilt_it.add_after_then_move(old_werd);
          continue;
        }
        // The replacement takes the place of the old word.
        rebuilt_it.add_after_then_move(fresh_werd);
        delete old_werd;
      }
      // Swap the row's (now drained) word list for the rebuilt one.
      row->word_list()->clear();
      werd_it.move_to_first();
      werd_it.add_list_after(&rebuilt_words);
    }
  }
}
Ejemplo n.º 15
0
/**
 * Computes the two end points (x1, y1) and (x2, y2) of the baseline of the
 * current object at the given level.
 * For RIL_WORD and RIL_SYMBOL the end points sit at the left and right edges
 * of the word's bounding box, for every other level at those of the row's.
 * The points are rotated by the block's re_rotation and then mapped through
 * scale_, rect_left_, rect_top_ and rect_height_.
 * WARNING: with vertical text, baselines may be vertical!
 * @return false (outputs untouched) when the iterator is past the last word.
 */
bool PageIterator::Baseline(PageIteratorLevel level,
                            int* x1, int* y1, int* x2, int* y2) const {
  if (it_->word() == NULL) return false;  // Nothing left to report on.

  ROW* row = it_->row()->row;
  WERD* word = it_->word()->word;
  const bool word_extent = (level == RIL_WORD || level == RIL_SYMBOL);
  TBOX box = word_extent ? word->bounding_box() : row->bounding_box();

  // Sample the row's baseline at both horizontal ends, rounding to nearest.
  int left = box.left();
  int right = box.right();
  ICOORD startpt(left, static_cast<inT16>(row->base_line(left) + 0.5));
  ICOORD endpt(right, static_cast<inT16>(row->base_line(right) + 0.5));

  // Rotate to image coordinates and convert to global image coords.
  startpt.rotate(it_->block()->block->re_rotation());
  endpt.rotate(it_->block()->block->re_rotation());

  *x1 = startpt.x() / scale_ + rect_left_;
  *y1 = (rect_height_ - startpt.y()) / scale_ + rect_top_;
  *x2 = endpt.x() / scale_ + rect_left_;
  *y2 = (rect_height_ - endpt.y()) / scale_ + rect_top_;
  return true;
}
Ejemplo n.º 16
0
// Appends the c-blob list and then the rejected c-blob list of every word in
// blocks to the end of output_blob_list, in block / row / word order.
void ExtractBlobsFromSegmentation(BLOCK_LIST* blocks,
                                  C_BLOB_LIST* output_blob_list) {
  C_BLOB_IT out_it(output_blob_list);
  BLOCK_IT b_it(blocks);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    ROW_IT r_it(b_it.data()->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      WERD_IT w_it(r_it.data()->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* source = w_it.data();
        // Regular blobs first ...
        out_it.move_to_last();
        out_it.add_list_after(source->cblob_list());
        // ... followed by the rejected ones.
        out_it.move_to_last();
        out_it.add_list_after(source->rej_cblob_list());
      }
    }
  }
}
Ejemplo n.º 17
0
// Makes a non-text block structurally valid:
//  - a block without rows gets one ROW built from the block's bounding box;
//  - a row without words gets one WERD holding a single fake blob;
//  - every word is asserted to hold at least one blob.
// Each row's bounding box is recalculated afterwards.
void Textord::cleanup_nontext_block(BLOCK* block) {
  ROW_IT row_it(block->row_list());
  if (row_it.empty()) {
    // No rows at all: synthesise one spanning the block's bounding box.
    const TBOX& block_box = block->pdblk.bounding_box();
    float height = block_box.height();
    const float half_height = height / 2.0f;
    const float quarter_height = height / 4.0f;
    int32_t xstarts[2] = {block_box.left(), block_box.right()};
    double coeffs[3] = {0.0, 0.0, static_cast<double>(block_box.bottom())};
    ROW* fake_row = new ROW(1, xstarts, coeffs, half_height, quarter_height,
                            quarter_height, 0, 1);
    row_it.add_after_then_move(fake_row);
  }

  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    ROW* row = row_it.data();
    WERD_IT w_it(row->word_list());
    if (w_it.empty()) {
      // No words: add one whose only blob is a fake blob. The blob covers
      // the whole block when this is the block's only row, else the row.
      TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
                                                : row->bounding_box();
      C_BLOB_LIST fake_blobs;
      C_BLOB_IT fake_it(&fake_blobs);
      fake_it.add_after_then_move(C_BLOB::FakeBlob(box));
      w_it.add_after_then_move(new WERD(&fake_blobs, 0, nullptr));
    }
    // A word without blobs is unexpected; assert so the cause can be found.
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD* word = w_it.data();
      ASSERT_HOST(!word->cblob_list()->empty());
    }
    row->recalc_bounding_box();
  }
}
Ejemplo n.º 18
0
/**
 * word_set_display()  Word processor
 *
 * Copies each display-mode bit of word_display_mode onto the matching
 * display flag of the word, then hands the word to word_display() and
 * returns its result.
 */
BOOL8 Tesseract::word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
  WERD* target = word_res->word;

  // Box and text overlays.
  target->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
  target->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
  // Outline renderings.
  target->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
  target->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
  target->set_display_flag(DF_BN_POLYGONAL,
                           word_display_mode.bit(DF_BN_POLYGONAL));
  // Blamer information.
  target->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));

  return word_display(block, row, word_res);
}
Ejemplo n.º 19
0
/**
 * word_set_display()  Word processor
 *
 * Copies each display-mode bit of word_display_mode onto the matching
 * display flag of the iterator's current word, then hands the iterator to
 * word_display() and returns its result.
 */
BOOL8 Tesseract::word_set_display(PAGE_RES_IT* pr_it) {
  WERD* target = pr_it->word()->word;

  // Box and text overlays.
  target->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
  target->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
  // Outline renderings.
  target->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
  target->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
  target->set_display_flag(DF_BN_POLYGONAL,
                           word_display_mode.bit(DF_BN_POLYGONAL));
  // Blamer information.
  target->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));

  return word_display(pr_it);
}
Ejemplo n.º 20
0
void ROW::recalc_bounding_box() {  //recalculate BB
  WERD *word;                    //current word
  WERD_IT it = &words;           //words of ROW
  inT16 left;                    //of word
  inT16 prev_left;               //old left

  if (!it.empty ()) {
    word = it.data ();
    prev_left = word->bounding_box ().left ();
    it.forward ();
    while (!it.at_first ()) {
      word = it.data ();
      left = word->bounding_box ().left ();
      if (left < prev_left) {
        it.move_to_first ();
                                 //words in BB order
        it.sort (word_comparator);
        break;
      }
      prev_left = left;
      it.forward ();
    }
  }
  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
    word = it.data ();
    if (it.at_first ())
      word->set_flag (W_BOL, TRUE);
    else
                                 //not start of line
      word->set_flag (W_BOL, FALSE);
    if (it.at_last ())
      word->set_flag (W_EOL, TRUE);
    else
                                 //not end of line
      word->set_flag (W_EOL, FALSE);
                                 //extend BB as reqd
    bound_box += word->bounding_box ();
  }
}
Ejemplo n.º 21
0
/// For every word of page_res that carries a non-empty text label, converts
/// the label to unichar ids and calls FindSegmentation with them. A word
/// whose label cannot be converted, or for which FindSegmentation fails, is
/// reported with tprintf and deleted from the page.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  while ((word_res = pr_it.word()) != NULL) {
    WERD* word = word_res->word;
    // Words that have no text are left alone.
    if (word->text() != NULL && word->text()[0] != '\0') {
      // Convert the correct text to a vector of UNICHAR_ID
      GenericVector<UNICHAR_ID> target_text;
      if (!ConvertStringToUnichars(word->text(), &target_text)) {
        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
                word->text());
        pr_it.DeleteCurrentWord();
      } else if (!FindSegmentation(target_text, word_res)) {
        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
                word->text());
        pr_it.DeleteCurrentWord();
      }
    }
    pr_it.forward();
  }
}
Ejemplo n.º 22
0
/**********************************************************************
 * fixed_pitch_words
 *
 * Make a ROW from a fixed pitch TO_ROW.
 *
 * The blobs of the row are chopped at the x coordinates stored in
 * row->char_cells. Cells that yield outlines become blobs of the word being
 * built; a cell that yields none ends that word and is converted to blanks.
 * Words from row->rep_words are spliced in via add_repeated_word once their
 * left edge lies before the current chop coordinate.
 *
 * Returns the new ROW, or nullptr when the row has no blobs or fewer than
 * two char cells.
 * NOTE(review): the rotation parameter and the xstarts array are not read
 * anywhere in this function - confirm they can be dropped.
 **********************************************************************/
ROW *fixed_pitch_words(                 //find lines
                       TO_ROW *row,     //row to do
                       FCOORD rotation  //for drawing
                      ) {
  bool bol;                     //start of line
  uint8_t blanks;                  //in front of word
  uint8_t new_blanks;              //blanks in empty cell
  int16_t chop_coord;              //chop boundary
  int16_t prev_chop_coord;         //start of cell
  int16_t rep_left;                //left edge of rep word
  ROW *real_row;                 //output row
  C_OUTLINE_LIST left_coutlines;
  C_OUTLINE_LIST right_coutlines;
  C_BLOB_LIST cblobs;
  C_BLOB_IT cblob_it = &cblobs;
  WERD_LIST words;
  WERD_IT word_it = &words;      //new words
                                 //repeated blobs
  WERD_IT rep_it = &row->rep_words;
  WERD *word;                    //new word
  int32_t xstarts[2];              //row ends
  int32_t prev_x;                  //end of prev blob
                                 //iterator
  BLOBNBOX_IT box_it = row->blob_list ();
                                 //boundaries
  ICOORDELT_IT cell_it = &row->char_cells;

#ifndef GRAPHICS_DISABLED
  if (textord_show_page_cuts && to_win != nullptr) {
    plot_row_cells (to_win, ScrollView::RED, row, 0, &row->char_cells);
  }
#endif

  prev_x = -INT16_MAX;
  bol = true;
  blanks = 0;
  // rep_left is the left edge of the next repeated word; INT16_MAX means
  // there is none left.
  if (rep_it.empty ())
    rep_left = INT16_MAX;
  else
    rep_left = rep_it.data ()->bounding_box ().left ();
  if (box_it.empty ())
    return nullptr;                 //empty row
  xstarts[0] = box_it.data ()->bounding_box ().left ();
  if (rep_left < xstarts[0]) {
    xstarts[0] = rep_left;
  }
  // At least two cell boundaries are required to chop anything.
  if (cell_it.empty () || row->char_cells.singleton ()) {
    tprintf ("Row without enough char cells!\n");
    tprintf ("Leftmost blob is at (%d,%d)\n",
      box_it.data ()->bounding_box ().left (),
      box_it.data ()->bounding_box ().bottom ());
    return nullptr;
  }
  ASSERT_HOST (!cell_it.empty () && !row->char_cells.singleton ());
  prev_chop_coord = cell_it.data ()->x ();
  word = nullptr;
  // Repeated words lying left of the first cell boundary go in first.
  while (rep_left < cell_it.data ()->x ()) {
    word = add_repeated_word (&rep_it, rep_left, prev_chop_coord,
      blanks, row->fixed_pitch, &word_it);
  }
  cell_it.mark_cycle_pt ();
  // Skip the first boundary if the chop position is already at or past it.
  if (prev_chop_coord >= cell_it.data ()->x ())
    cell_it.forward ();
  for (; !cell_it.cycled_list (); cell_it.forward ()) {
    chop_coord = cell_it.data ()->x ();
    // Feed every blob starting at or before this boundary to split_to_blob,
    // tracking the rightmost blob edge seen so far in prev_x.
    while (!box_it.empty ()
    && box_it.data ()->bounding_box ().left () <= chop_coord) {
      if (box_it.data ()->bounding_box ().right () > prev_x)
        prev_x = box_it.data ()->bounding_box ().right ();
      split_to_blob (box_it.extract (), chop_coord,
        textord_fp_chop_error + 0.5f,
        &left_coutlines,
        &right_coutlines);
      box_it.forward ();
      // Discard any following BLOBNBOXes that carry no cblob.
      while (!box_it.empty() && box_it.data()->cblob() == nullptr) {
        delete box_it.extract();
        box_it.forward();
      }
    }
    // Outlines are pending on the right only: call split_to_blob with no
    // blob so the pending outlines are processed against this boundary.
    if (!right_coutlines.empty() && left_coutlines.empty())
      split_to_blob (nullptr, chop_coord,
        textord_fp_chop_error + 0.5f,
        &left_coutlines,
        &right_coutlines);
    if (!left_coutlines.empty()) {
      // The cell holds outlines: they form the next blob of the word.
      cblob_it.add_after_then_move(new C_BLOB(&left_coutlines));
    } else {
      // Empty cell: measure it in blanks, up to the next repeated word if
      // that starts before the boundary, else up to the boundary itself.
      if (rep_left < chop_coord) {
        if (rep_left > prev_chop_coord)
          new_blanks = (uint8_t) floor ((rep_left - prev_chop_coord)
            / row->fixed_pitch + 0.5);
        else
          new_blanks = 0;
      }
      else {
        if (chop_coord > prev_chop_coord)
          new_blanks = (uint8_t) floor ((chop_coord - prev_chop_coord)
            / row->fixed_pitch + 0.5);
        else
          new_blanks = 0;
      }
      if (!cblob_it.empty()) {
        // Blobs are pending: close them off as a word. At least one blank
        // is forced when the preceding word is not a repeated-char word.
        if (blanks < 1 && word != nullptr && !word->flag (W_REP_CHAR))
          blanks = 1;
        word = new WERD (&cblobs, blanks, nullptr);
        cblob_it.set_to_list (&cblobs);
        word->set_flag (W_DONT_CHOP, TRUE);
        word_it.add_after_then_move (word);
        if (bol) {
          word->set_flag (W_BOL, TRUE);
          bol = false;
        }
        blanks = new_blanks;
      }
      else
        blanks += new_blanks;
      // Splice in the repeated words that start before this boundary.
      while (rep_left < chop_coord) {
        word = add_repeated_word (&rep_it, rep_left, prev_chop_coord,
          blanks, row->fixed_pitch, &word_it);
      }
    }
    if (prev_chop_coord < chop_coord)
      prev_chop_coord = chop_coord;
  }
  // Blobs still pending after the last boundary form the final word.
  if (!cblob_it.empty()) {
    word = new WERD(&cblobs, blanks, nullptr);
    word->set_flag (W_DONT_CHOP, TRUE);
    word_it.add_after_then_move (word);
    if (bol)
      word->set_flag (W_BOL, TRUE);
  }
  ASSERT_HOST (word != nullptr);
  // Any repeated words still left go at the end.
  while (!rep_it.empty ()) {
    add_repeated_word (&rep_it, rep_left, prev_chop_coord,
      blanks, row->fixed_pitch, &word_it);
  }
                                 //at end of line
  word_it.data ()->set_flag (W_EOL, TRUE);
  if (prev_chop_coord > prev_x)
    prev_x = prev_chop_coord;
  xstarts[1] = prev_x + 1;
  real_row = new ROW (row, (int16_t) row->kern_size, (int16_t) row->space_size);
  word_it.set_to_list (real_row->word_list ());
                                 //put words in row
  word_it.add_list_after (&words);
  real_row->recalc_bounding_box ();
  return real_row;
}
Ejemplo n.º 23
0
/// Gathers every source blob that strongly overlaps the given box into one
/// new word labelled with correct_text.
/// A blob claimed by both box and next_box is awarded to whichever of the two
/// it misses least according to BoxMissMetric; on a tie the current box wins.
/// Words that already carry a non-empty label are skipped.
/// @param block_list blocks whose rows and words are searched.
/// @param box the box being applied.
/// @param next_box the following box, which competes for shared blobs.
/// @param correct_text label stored on the new word.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  // Created lazily, the first time a blob is accepted for this box.
  WERD* labelled_word = NULL;
  BLOCK_IT block_iter(block_list);
  for (block_iter.mark_cycle_pt(); !block_iter.cycled_list();
       block_iter.forward()) {
    BLOCK* candidate_block = block_iter.data();
    if (!box.major_overlap(candidate_block->bounding_box()))
      continue;
    ROW_IT row_iter(candidate_block->row_list());
    for (row_iter.mark_cycle_pt(); !row_iter.cycled_list();
         row_iter.forward()) {
      ROW* candidate_row = row_iter.data();
      if (!box.major_overlap(candidate_row->bounding_box()))
        continue;
      WERD_IT word_iter(candidate_row->word_list());
      for (word_iter.mark_cycle_pt(); !word_iter.cycled_list();
           word_iter.forward()) {
        WERD* source_word = word_iter.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          source_word->bounding_box().print();
        }
        // Skip words that are already done, then words the box barely touches.
        const char* existing_label = source_word->text();
        const bool already_labelled =
            existing_label != NULL && existing_label[0] != '\0';
        if (already_labelled ||
            !box.major_overlap(source_word->bounding_box()))
          continue;
        C_BLOB_IT source_blob_it(source_word->cblob_list());
        for (source_blob_it.mark_cycle_pt(); !source_blob_it.cycled_list();
             source_blob_it.forward()) {
          C_BLOB* candidate_blob = source_blob_it.data();
          const TBOX blob_box = candidate_blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
          const double miss_for_box = BoxMissMetric(blob_box, box);
          const double miss_for_next = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    miss_for_box, miss_for_next);
          }
          if (miss_for_box > miss_for_next)
            continue;  // The next box has the stronger claim on this blob.
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (labelled_word == NULL) {
            // First accepted blob: start the new word as a shallow copy of
            // the word it came from and append it to that word's list.
            labelled_word = source_word->shallow_copy();
            labelled_word->set_text(correct_text);
            word_iter.add_to_end(labelled_word);
          }
          // Move the blob out of its source word into the new word.
          C_BLOB_IT target_blob_it(labelled_word->cblob_list());
          target_blob_it.add_to_end(source_blob_it.extract());
        }
      }
    }
  }
  const bool found_blob = labelled_word != NULL;
  if (!found_blob && applybox_debug > 0) tprintf("FAIL!\n");
  return found_blob;
}
Ejemplo n.º 24
0
/**
 *  word_display()  Word Processor
 *
 *  Draws one word into image_win according to the active display modes.
 *
 *  When color_mode is anything other than CM_RAINBOW and the word has a
 *  box_word, only the per-blob boxes are drawn: red where the blob (or the
 *  word's font) has the attribute selected by color_mode, green otherwise.
 *  In that mode the function returns false, without drawing, when the word
 *  has no fontinfo.
 *
 *  Otherwise each display flag set on the word (DF_BOX, DF_EDGE_STEP,
 *  DF_POLYGONAL, DF_TEXT, DF_BLAMER) adds its own rendering, and the plain
 *  bounding box is drawn if none of them produced anything.
 *
 *  @param block     not used by this function
 *  @param row       not used by this function
 *  @param word_res  recognition result whose word is displayed
 *  @return TRUE, except for the missing-fontinfo case described above
 */
BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
  WERD* word = word_res->word;

  // Attribute colouring mode: one rectangle per blob, nothing else.
  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* box_word = word_res->box_word;
    const int num_blobs = box_word->length();
    if (word_res->fontinfo == NULL) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int blob_index = 0; blob_index < num_blobs; ++blob_index) {
      // True when this blob shows the attribute selected by color_mode.
      bool highlight = false;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          highlight = box_word->BlobPosition(blob_index) == SP_SUBSCRIPT;
          break;
        case CM_SUPERSCRIPT:
          highlight = box_word->BlobPosition(blob_index) == SP_SUPERSCRIPT;
          break;
        case CM_ITALIC:
          highlight = font_info.is_italic();
          break;
        case CM_BOLD:
          highlight = font_info.is_bold();
          break;
        case CM_FIXEDPITCH:
          highlight = font_info.is_fixed_pitch();
          break;
        case CM_SERIF:
          highlight = font_info.is_serif();
          break;
        case CM_SMALLCAPS:
          highlight = word_res->small_caps;
          break;
        case CM_DROPCAPS:
          highlight = box_word->BlobPosition(blob_index) == SP_DROPCAP;
          break;
          // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(highlight ? ScrollView::RED : ScrollView::GREEN);
      const TBOX& blob_box = box_word->BlobBox(blob_index);
      image_win->Rectangle(blob_box.left(), blob_box.bottom(),
                           blob_box.right(), blob_box.top());
    }
    return true;
  }

  BOOL8 drew_anything = FALSE;

  /*
    The colour settings are coerced twice, first to inT32 and then to
    ScrollView::Color, to keep the compiler happy.
  */
  // Word bounding box followed by the box of each blob.
  if (word->display_flag(DF_BOX)) {
    const ScrollView::Color word_box_color =
        (ScrollView::Color)((inT32) editor_image_word_bb_color);
    word->bounding_box().plot(image_win, word_box_color, word_box_color);

    const ScrollView::Color blob_box_color =
        (ScrollView::Color)((inT32) editor_image_blob_bb_color);
    image_win->Pen(blob_box_color);
    C_BLOB_IT blob_it(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward())
      blob_it.data()->bounding_box().plot(image_win);
    drew_anything = TRUE;
  }

  // Edge steps, drawn in rainbow colours.
  if (word->display_flag(DF_EDGE_STEP)) {
    word->plot(image_win);
    drew_anything = TRUE;
  }

  // Polygonal approximation: needs a temporary converted copy of the word.
  if (word->display_flag(DF_POLYGONAL)) {
    TWERD* polygon_word = TWERD::PolygonalCopy(word);
    polygon_word->plot(image_win);
    delete polygon_word;
    drew_anything = TRUE;
  }

  // Correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
    text = word->text();
  }
  const BlamerBundle* blamer_bundle = word_res->blamer_bundle;
  const bool known_correct =
      blamer_bundle != NULL &&
      blamer_bundle->incorrect_result_reason == IRR_CORRECT;
  if (word->display_flag(DF_BLAMER) && !known_correct) {
    // Blamer output replaces any text picked up above:
    // "<truth> -> <best choice>" plus the reason on a second line.
    text = "";
    if (blamer_bundle == NULL) {
      text += "NULL";
    } else {
      for (int i = 0; i < blamer_bundle->truth_text.length(); ++i) {
        text += blamer_bundle->truth_text[i];
      }
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == NULL) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
    }
    text += best_choice_str;
    // A word without a blamer bundle is reported as IRR_PAGE_LAYOUT.
    IncorrectResultReason reason = (blamer_bundle == NULL) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason;
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  if (text.length() > 0) {
    const TBOX word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    const int word_height = word_bb.height();
    // Half the word height, capped at 20.
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    // Indent from the bottom-left corner only when the word is wider than
    // it is tall.
    const float shift =
        (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.string());
    if (blame.length() > 0) {
      // The blame line sits one text height below the text line.
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.string());
    }

    drew_anything = TRUE;
  }

  // Nothing was drawn by any mode: show the bounding box anyway.
  if (!drew_anything) {
    const ScrollView::Color fallback_color =
        (ScrollView::Color)((inT32) editor_image_word_bb_color);
    word->bounding_box().plot(image_win, fallback_color, fallback_color);
  }
  return TRUE;
}
Ejemplo n.º 25
0
// Measures single-character classification on labelled data.
// Every word that carries a one-character label, whose label is not listed
// in applybox_test_exclusions and which holds exactly one blob is normalised
// with make_bln_copy(), classified with tess_segment_pass1() and compared
// with its label. Rejections and mismatches are reported through tprintf();
// unless SECURE_NAMES is defined, per-word statistics and a final summary
// are printed as well.
//   block_list: blocks whose rows and words are tested.
void apply_box_testing(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  INT16 row_count = 0;           //rows seen so far, over all blocks
  WERD_IT word_it;
  WERD *word;
  WERD *bln_word;                //copy of word made by make_bln_copy
  INT16 word_count = 0;          //words seen so far in the current row
  PBLOB_IT blob_it;
  DENORM denorm;
  INT16 count = 0;               //incremented per tested word, never read
  char ch[2];                    //the word's label as a 1-char C string
  WERD *outword;                 //bln best choice
  //segmentation
  WERD_CHOICE *best_choice;      //tess output
  WERD_CHOICE *raw_choice;       //top choice permuter
                                 //detailed results
  BLOB_CHOICE_LIST_CLIST blob_choices;
  INT16 char_count = 0;          //words actually tested
  INT16 correct_count = 0;       //result equals the label
  INT16 err_count = 0;           //result differs from the label
  INT16 rej_count = 0;           //empty or all-blank result
  #ifndef SECURE_NAMES
  WERDSTATS wordstats;           //As from newdiff
  #endif
  char tess_rej_str[3];          //marker recorded for rejected words
  char tess_long_str[3];         //marker recorded for results over 2 chars

  ch[1] = '\0';
  strcpy (tess_rej_str, "|A");
  strcpy (tess_long_str, "|B");

  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      row_count++;
      word_count = 0;
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        word_count++;
        if ((strlen (word->text ()) == 1) &&
          !STRING (applybox_test_exclusions).contains (*word->text ())
        && (word->gblob_list ()->length () == 1)) {
          /* Here is a word with a single char label and a single blob so test it */
          bln_word =
            make_bln_copy (word, row, row->x_height (), &denorm);
          blob_it.set_to_list (bln_word->blob_list ());
          ch[0] = *word->text ();
          char_count++;
          best_choice = tess_segment_pass1 (bln_word,
            &denorm,
            tess_default_matcher,
            raw_choice,
            &blob_choices, outword);

          /*
            Test for TESS screw up on word. Recog_word has already ensured that the
            choice list, outword blob lists and best_choice string are the same
            length. A TESS screw up is indicated by a blank filled or 0 length string.
          */
          if ((best_choice->string ().length () == 0) ||
            (strspn (best_choice->string ().string (), " ") ==
          best_choice->string ().length ())) {
            rej_count++;
            tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
              row_count, word_count, ch);
            #ifndef SECURE_NAMES
            wordstats.word (tess_rej_str, 2, ch, 1);
            #endif
          }
          else {
            /* Print the three lengths before the asserts below fire on a mismatch */
            if ((best_choice->string ().length () !=
              outword->blob_list ()->length ()) ||
              (best_choice->string ().length () !=
            blob_choices.length ())) {
              tprintf
                ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
                best_choice->string ().string (),
                best_choice->string ().length (),
                outword->blob_list ()->length (),
                blob_choices.length ());
            }
            ASSERT_HOST (best_choice->string ().length () ==
              outword->blob_list ()->length ());
            ASSERT_HOST (best_choice->string ().length () ==
              blob_choices.length ());
            /* fix_quotes is handed a writable pointer to best_choice's own
               buffer, so the comparisons below see whatever it left there */
            fix_quotes ((char *) best_choice->string ().string (),
                                 //turn to double
              outword, &blob_choices);
            if (strcmp (best_choice->string ().string (), ch) != 0) {
              err_count++;
              tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
                row_count, word_count, ch,
                best_choice->string ().string ());
            }
            else
              correct_count++;
            #ifndef SECURE_NAMES
            /* Results longer than 2 chars are recorded as the "|B" marker */
            if (best_choice->string ().length () > 2)
              wordstats.word (tess_long_str, 2, ch, 1);
            else
              wordstats.word ((char *) best_choice->string ().
                string (),
                best_choice->string ().length (), ch,
                1);
            #endif
          }
          /* Release everything produced for this word before moving on */
          delete bln_word;
          delete outword;
          delete best_choice;
          delete raw_choice;
          blob_choices.deep_clear ();
          count++;
        }
      }
    }
  }
  #ifndef SECURE_NAMES
  wordstats.print (1, 100.0);
  wordstats.conf_matrix ();
  tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
    char_count, correct_count, rej_count, err_count);
  #endif
}
Ejemplo n.º 26
0
/*************************************************************************
 * tidy_up()
 *   - report >1 block
 *   - sort the words in each row.
 *   - report any rows with no labelled words.
 *   - report any remaining unlabelled words
 *   - report total labelled words
 *   - when applybox_rebalance is set, duplicate labelled words until every
 *     character that has more than one sample reaches its target count
 *
 * block_list                 real blocks; rebalancing adds words to them
 * ok_char_count              out: number of labelled words
 * ok_row_count               out: number of rows holding a labelled word
 * unlabelled_words           out: number of words with an empty label
 * tgt_char_counts            in: target sample count per character code;
 *                            entries 0..127 are read
 * rebalance_count            incremented once per duplicated word; it is
 *                            not reset here
 * min_char, min_samples      out: the targeted character with the fewest
 *                            labelled samples and that count; min_char is
 *                            left alone when no character has a target > 0
 * final_labelled_blob_count  out: words with a one-character label and
 *                            exactly one blob, counted after rebalancing
 *************************************************************************/
void tidy_up(                         //
             BLOCK_LIST *block_list,  //real blocks
             INT16 &ok_char_count,
             INT16 &ok_row_count,
             INT16 &unlabelled_words,
             INT16 *tgt_char_counts,
             INT16 &rebalance_count,
             char &min_char,
             INT16 &min_samples,
             INT16 &final_labelled_blob_count) {
  const int kCharCodes = 128;    //size of the per-character tables
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  WERD_IT word_it;
  WERD *word;
  INT16 block_idx = 0;
  INT16 row_idx;
  INT16 all_row_idx = 0;
  BOOL8 rebalance_needed = FALSE;
                                 //No. of unique labelled samples
  INT16 labelled_char_counts[kCharCodes];
  INT16 i;

  for (i = 0; i < kCharCodes; i++)
    labelled_char_counts[i] = 0;

  ok_char_count = 0;
  ok_row_count = 0;
  unlabelled_words = 0;
  if ((applybox_debug > 4) && (block_it.length () != 1))
    tprintf ("APPLY_BOXES: More than one block??\n");

  /* Pass 1: sort each row and tally labelled / unlabelled words */
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    block_idx++;
    row_idx = 0;
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row_idx++;
      all_row_idx++;
      // Reset for every row so that one labelled row cannot vouch for the
      // rows that follow it in the same block.
      BOOL8 row_ok = FALSE;
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if (strlen (word->text ()) == 0) {
          unlabelled_words++;
          if (applybox_debug > 4) {
            tprintf
              ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
              block_idx, row_idx, all_row_idx);
          }
          continue;
        }
        if (word->gblob_list ()->length () != 1)
          tprintf
            ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
            block_idx, row_idx, all_row_idx);

        ok_char_count++;
        // Plain char may be signed: go through unsigned char and skip codes
        // that fall outside the tables instead of indexing out of bounds.
        const int label_code = static_cast<unsigned char>(*word->text ());
        if (label_code < kCharCodes)
          labelled_char_counts[label_code]++;
        row_ok = TRUE;
      }
      // The count must not depend on the debug level; only the message does.
      if (row_ok)
        ok_row_count++;
      else if (applybox_debug > 4)
        tprintf
          ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
          block_idx, row_idx, all_row_idx);
    }
  }

  /* Compare the tallies with the targets and find the scarcest character */
  min_samples = 9999;
  for (i = 0; i < kCharCodes; i++) {
    if (tgt_char_counts[i] > labelled_char_counts[i]) {
      if (labelled_char_counts[i] <= 1) {
        tprintf
          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
          labelled_char_counts[i], (char) i, tgt_char_counts[i]);
      }
      else {
        rebalance_needed = TRUE;
        if (applybox_debug > 0)
          tprintf
            ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
            (char) i, tgt_char_counts[i], labelled_char_counts[i]);
      }
    }
    if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
      min_samples = labelled_char_counts[i];
      min_char = (char) i;
    }
  }

  /* Rebalance: each pass duplicates every distinct under-represented sample */
  ROW *prev_row = NULL;
  INT16 prev_left = -1;
  char prev_ch = '\0';
  while (applybox_rebalance && rebalance_needed) {
    BOOL8 duplicated_any = FALSE;
    block_it.set_to_list (block_list);
    for (block_it.mark_cycle_pt ();
         !block_it.cycled_list (); block_it.forward ()) {
      row_it.set_to_list (block_it.data ()->row_list ());
      for (row_it.mark_cycle_pt ();
           !row_it.cycled_list (); row_it.forward ()) {
        row = row_it.data ();
        word_it.set_to_list (row->word_list ());
        for (word_it.mark_cycle_pt ();
             !word_it.cycled_list (); word_it.forward ()) {
          word = word_it.data ();
          const INT16 left = word->bounding_box ().left ();
          const char ch = *word->text ();
          const int ch_code = static_cast<unsigned char>(ch);
          // A word in the same row, at the same left edge and with the same
          // label as its predecessor is a copy made by an earlier pass.
          // The left edges are COMPARED here; assigning instead would pin
          // prev_left at its initial value and make the test always true.
          const BOOL8 at_dupe_of_prev_word = ((row == prev_row) &&
            (left == prev_left) &&
            (ch == prev_ch));
          if ((ch != '\0') &&
            (ch_code < kCharCodes) &&
            (labelled_char_counts[ch_code] > 1) &&
            (tgt_char_counts[ch_code] > labelled_char_counts[ch_code]) &&
            (!at_dupe_of_prev_word)) {
            /* Duplicate the word to rebalance the labelled samples */
            if (applybox_debug > 9) {
              tprintf ("Duping \"%c\" from ", ch);
              word->bounding_box ().print ();
            }
            WERD *duplicate_word = new WERD;
            *duplicate_word = *word;
            // Moving onto the copy keeps it from being visited in this pass.
            word_it.add_after_then_move (duplicate_word);
            rebalance_count++;
            labelled_char_counts[ch_code]++;
            duplicated_any = TRUE;
          }
          prev_row = row;
          prev_left = left;
          prev_ch = ch;
        }
      }
    }
    // A pass that added nothing can never reach the targets; stop rather
    // than loop forever.
    if (!duplicated_any)
      break;
    rebalance_needed = FALSE;
    for (i = 0; i < kCharCodes; i++) {
      if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
        (labelled_char_counts[i] > 1)) {
        rebalance_needed = TRUE;
        break;
      }
    }
  }

  /* Now final check - count labelled blobs */
  final_labelled_blob_count = 0;
  block_it.set_to_list (block_list);
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if ((strlen (word->text ()) == 1) &&
          (word->gblob_list ()->length () == 1))
          final_labelled_blob_count++;
      }
    }
  }
}
Ejemplo n.º 27
0
// Moves every outline of row that majorly overlaps box into one new
// single-blob word labelled ch, and appends that word to the row.
//
// If any such outline belongs to a word that is already labelled, the box is
// rejected: every labelled word hit is unlabelled and no new word is added.
// Blobs left without outlines and words left without blobs are deleted.
// The finished word is finally checked against the character classes
// chs_caps_ht, chs_odd_top, chs_x_ht, chs_non_ambig_bl, chs_odd_bot and
// chs_desc; if its top or bottom is implausible for the class of ch[0] the
// word stays in the row but its label is cleared.
//
//   row                             row searched and extended
//   box                             box read from the box file
//   ch                              label for the new word
//   block_id, row_id                used only in diagnostic messages
//   boxfile_lineno, boxfile_charno  passed on to report_failed_box()
// Returns 0 on success, otherwise a positive count of characters lost.
INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD_IT word_it;
  WERD *word;
  WERD *new_word = NULL;         //created on the first outline taken
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  BOX new_word_box;
  float word_x_centre;
  float baseline;
  INT16 error_count = 0;         //number of chars lost

  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (!word->bounding_box ().overlap (box))
      continue;
    blob_it.set_to_list (word->gblob_list ());
    for (blob_it.mark_cycle_pt ();
         !blob_it.cycled_list (); blob_it.forward ()) {
      blob = blob_it.data ();
      if (!gblob_bounding_box (blob, polyg).overlap (box))
        continue;
      outline_it.set_to_list (gblob_out_list (blob, polyg));
      for (outline_it.mark_cycle_pt ();
           !outline_it.cycled_list (); outline_it.forward ()) {
        outline = outline_it.data ();
        if (!goutline_bounding_box (outline, polyg).major_overlap (box))
          continue;

        if (strlen (word->text ()) > 0) {
          // The box lands on a word that is already labelled: the box
          // itself is lost (counted once) and so is each word hit.
          if (error_count == 0) {
            error_count = 1;
            if (applybox_debug > 4)
              report_failed_box (boxfile_lineno,
                boxfile_charno,
                box, ch,
                "FAILURE! box overlaps blob in labelled word");
          }
          if (applybox_debug > 4)
            tprintf
              ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
              block_id, row_id,
              word->text ());
          word->set_text ("");   //UN label it
          error_count++;
        }

        if (error_count == 0) {
          if (new_word == NULL) {
                                 /* Make a new word with a single blob */
            new_word = word->shallow_copy ();
            new_word->set_text (ch);
            if (polyg)
              new_blob = new PBLOB;
            else
              new_blob = (PBLOB *) new C_BLOB;
            new_blob_it.set_to_list (new_word->gblob_list ());
            new_blob_it.add_to_end (new_blob);
            new_outline_it.set_to_list (gblob_out_list (new_blob, polyg));
          }
                                 //move the outline into the new blob
          new_outline_it.add_to_end (outline_it.extract ());
        }
      }
                                 //no outlines in blob
      if (outline_it.empty ())
                                 //so delete blob
        delete blob_it.extract ();
    }
    if (blob_it.empty ())        //no blobs in word
                                 //so delete word
      delete word_it.extract ();
  }
  if (error_count > 0) {
    // A word may have been started before the first labelled word was hit.
    // It is never added to the row, so it has to be released here or it,
    // and the outlines already moved into it, would be leaked.
    delete new_word;
    return error_count;
  }

  if (new_word != NULL) {
    gblob_sort_list (new_word->gblob_list (), polyg);
    word_it.add_to_end (new_word);
    new_word_box = new_word->bounding_box ();
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

    // Sanity checks of the word's top and bottom against what the class of
    // ch[0] allows; the first failing check unlabels the word and returns.
    if (STRING (chs_caps_ht).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd top char below xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_x_ht).contains (ch[0]) &&
      ((new_word_box.top () >
      baseline + (1 + applybox_error_band) * row->x_height ()) ||
      (new_word_box.top () <
    baseline + (1 - applybox_error_band) * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! x-ht char didn't have top near xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
      ((new_word_box.bottom () <
      baseline - applybox_error_band * row->x_height ()) ||
      (new_word_box.bottom () >
    baseline + applybox_error_band * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! non ambig BL char didnt have bottom near baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_bot).contains (ch[0]) &&
      (new_word_box.bottom () >
    baseline + applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd bottom char above baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_desc).contains (ch[0]) &&
      (new_word_box.bottom () >
    baseline - applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Descender doesn't descend");
      new_word->set_text ("");
      return 1;
    }
    return 0;
  }
  else {
    report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
      "FAILURE! Couldn't find any blobs");
    return 1;
  }
}
Ejemplo n.º 28
0
// Finds the one row that owns the outlines covered by box.
//
// A row qualifies only when one of its outlines majorly overlaps the box;
// overlapping the bounding box of the row, word or blob merely narrows the
// search.
//
//   block_list         real blocks to search
//   box                box read from the box file
//   block_id           out: 1-based index of the block holding the returned
//                      row, or 0 when no row qualifies
//   row_id_to_process  out: 1-based index of the returned row within that
//                      block, or 0 when no row qualifies
// Returns the row, or NULL when no row qualifies or when outlines of more
// than one row qualify. In the latter case the two ids describe the first
// qualifying row.
ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     BOX box,                 //from boxfile
                     INT16 &block_id,
                     INT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  INT16 current_block_id = 0;    //1-based index of the block being scanned
  INT16 row_id;                  //1-based index of the row within its block
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  // Give both outputs a defined value even when nothing is found.
  block_id = 0;
  row_id_to_process = 0;
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    current_block_id++;
    row_id = 0;
    block = block_it.data ();
    if (!block->bounding_box ().overlap (box))
      continue;
    row_it.set_to_list (block->row_list ());
    for (row_it.mark_cycle_pt ();
         !row_it.cycled_list (); row_it.forward ()) {
      row_id++;
      row = row_it.data ();
      if (!row->bounding_box ().overlap (box))
        continue;
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        polyg = word->flag (W_POLYGON);
        if (!word->bounding_box ().overlap (box))
          continue;
        blob_it.set_to_list (word->gblob_list ());
        for (blob_it.mark_cycle_pt ();
             !blob_it.cycled_list (); blob_it.forward ()) {
          blob = blob_it.data ();
          if (!gblob_bounding_box (blob, polyg).overlap (box))
            continue;
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
               !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            if (!goutline_bounding_box (outline, polyg).major_overlap (box))
              continue;
            if ((row_to_process != NULL) && (row_to_process != row))
              /* RETURN ERROR Box overlaps blobs in more than one row  */
              return NULL;
            row_to_process = row;
            // Record the block that actually holds the row; the running
            // counter keeps advancing over the blocks scanned afterwards.
            block_id = current_block_id;
            row_id_to_process = row_id;
          }
        }
      }
    }
  }
  return row_to_process;
}
Ejemplo n.º 29
0
/// Decides whether a whole row looks like noise rather than text.
///
/// The outlines and blobs of the row are sorted into three tallies:
///  - dot_count: outlines smaller than textord_noise_sizelimit * x-height
///    (only in words without W_DONT_CHOP), plus 2 for every blob taller than
///    twice the x-height that is not the first blob of the first word;
///  - norm_count: blobs at least that size limit but under twice the
///    x-height, with fewer than textord_noise_translimit transitions;
///  - super_norm_count: outlines that have children and whose height and
///    width lie within the textord_noise_syfract / textord_noise_sxfract
///    tolerance of the x-height, plus every blob of a W_DONT_CHOP word.
///
/// @param row row to examine.
/// @return true when super_norm_count is below textord_noise_sncount, and
///         dot_count exceeds both norm_count * textord_noise_rowratio and 2.
bool Textord::clean_noise_from_row(          //remove empties
        ROW* row  //row to clean
) {
  TBOX blob_box;                 //bounding box
  C_BLOB *blob;                  //current blob
  C_OUTLINE *outline;            //current outline
  WERD *word;                    //current word
  int32_t blob_size;             //biggest size
  int32_t trans_count = 0;       //no of transitions
  int32_t trans_threshold;       //noise tolerance
  int32_t dot_count = 0;         //small objects
  int32_t norm_count = 0;        //normal objects
  int32_t super_norm_count = 0;  //real char-like
                                 //words of row
  WERD_IT word_it = row->word_list ();
  C_BLOB_IT blob_it;             //blob iterator
  C_OUTLINE_IT out_it;           //outline iterator

  // Per-blob tracing is enabled only for the row lying under the test point.
  const bool testing_on = textord_test_y > row->base_line (textord_test_x)
               && textord_show_blobs
               && textord_test_y < row->base_line (textord_test_x) + row->x_height ();

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();      //current word
                                 //blobs in word
    blob_it.set_to_list (word->cblob_list ());
    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
         blob_it.forward ()) {
      blob = blob_it.data ();
      if (!word->flag (W_DONT_CHOP)) {
                                 //get outlines
        out_it.set_to_list (blob->out_list ());
        for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
             out_it.forward ()) {
          outline = out_it.data ();
          blob_box = outline->bounding_box ();
          blob_size =
            blob_box.width () >
            blob_box.height () ? blob_box.width () : blob_box.height ();
          if (blob_size < textord_noise_sizelimit * row->x_height ())
            dot_count++;         //count small outlines
          if (!outline->child ()->empty ()
            && blob_box.height () <
            (1 + textord_noise_syfract) * row->x_height ()
            && blob_box.height () >
            (1 - textord_noise_syfract) * row->x_height ()
            && blob_box.width () <
            (1 + textord_noise_sxfract) * row->x_height ()
            && blob_box.width () >
            (1 - textord_noise_sxfract) * row->x_height ())
            super_norm_count++;  //count x-height sized outlines with holes
        }
      }
      else
        super_norm_count++;      //blobs of unchoppable words always count
      blob_box = blob->bounding_box ();
      blob_size =
        blob_box.width () >
        blob_box.height () ? blob_box.width () : blob_box.height ();
      if (blob_size >= textord_noise_sizelimit * row->x_height ()
          && blob_size < row->x_height () * 2) {
        trans_threshold = blob_size / textord_noise_sizefraction;
        trans_count = blob->count_transitions (trans_threshold);
        if (trans_count < textord_noise_translimit)
          norm_count++;
      }
      else if (blob_box.height () > row->x_height () * 2
        && (!word_it.at_first () || !blob_it.at_first ()))
        dot_count += 2;
      if (testing_on) {
        // trans_count is only refreshed for mid-sized blobs, so for other
        // blobs "tc" shows the value from the last blob that was measured.
        tprintf
          ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
          blob_box.left (), blob_box.bottom (), blob_box.right (),
          blob_box.top (), blob->out_list ()->length (), trans_count,
          blob_box.bottom () - row->base_line (blob_box.left ()));
      }
    }
  }

  // Compute the verdict once so that the debug trace and the return value
  // can never disagree.
  const bool reject_row = super_norm_count < textord_noise_sncount
    && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
  if (textord_noise_debug) {
    tprintf ("Row ending at (%d,%g):",
      blob_box.right (), row->base_line (blob_box.right ()));
    tprintf (" R=%g, dc=%d, nc=%d, %s\n",
      norm_count > 0 ? (float) dot_count / norm_count : 9999,
      dot_count, norm_count,
      reject_row ? "REJECTED" : "ACCEPTED");
  }
  return reject_row;
}
Ejemplo n.º 30
0
// Grades every word on a row by comparing the number of small ("dot")
// outlines with the number of normal-looking ones, then asks the words graded
// as noise to clean themselves.
//
// Pass 1 assigns each word a grade:
//   0 - fine, left alone;
//   1 - suspect, cleaned only if dud words outnumber all the other words;
//   2 - dud, always cleaned.
// Pass 2 calls CleanNoise() on the selected words.
//
// Does nothing when the row has no words or textord_no_rejects is set.
//
// row: the row whose words are to be examined.
void Textord::clean_noise_from_words(ROW *row) {
  const int8_t kWordFine = 0;
  const int8_t kWordSuspect = 1;
  const int8_t kWordDud = 2;

  WERD_IT word_it(row->word_list());
  const int32_t word_total = word_it.length();
  if (word_total == 0 || textord_no_rejects)
    return;

  // One grade per word, filled in by pass 1 and consumed by pass 2.
  std::vector<int8_t> word_dud(word_total);
  int32_t dud_words = 0;   // words graded kWordDud
  int32_t ok_words = 0;    // every other word
  int32_t word_index = 0;

  // Thresholds derived from the row's x-height; none change while scanning.
  const auto x_height = row->x_height();
  const auto dot_limit = textord_noise_sizelimit * x_height;
  const auto tall_limit = x_height * 2;
  const auto norm_height_max = (1 + textord_noise_syfract) * x_height;
  const auto norm_height_min = (1 - textord_noise_syfract) * x_height;
  const auto norm_width_max = (1 + textord_noise_sxfract) * x_height;
  const auto norm_width_min = (1 - textord_noise_sxfract) * x_height;

  // Pass 1: grade every word.
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD *word = word_it.data();
    int32_t dot_count = 0;   // evidence of noise
    int32_t norm_count = 0;  // evidence of real text

    C_BLOB_IT blob_it;
    blob_it.set_to_list(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB *blob = blob_it.data();

      if (word->flag(W_DONT_CHOP)) {
        // Outlines are not inspected for such words; the blob counts as normal.
        ++norm_count;
      } else {
        C_OUTLINE_IT out_it;
        out_it.set_to_list(blob->out_list());
        for (out_it.mark_cycle_pt(); !out_it.cycled_list();
             out_it.forward()) {
          C_OUTLINE *outline = out_it.data();
          TBOX out_box = outline->bounding_box();

          // Larger of the outline's two dimensions.
          int32_t out_size = out_box.width();
          if (out_size <= out_box.height())
            out_size = out_box.height();

          if (out_size < dot_limit)
            ++dot_count;  // small outline

          // An outline with children whose box is close to x-height in both
          // dimensions counts as a normal outline.
          if (!outline->child()->empty()
              && out_box.height() < norm_height_max
              && out_box.height() > norm_height_min
              && out_box.width() < norm_width_max
              && out_box.width() > norm_width_min)
            ++norm_count;
        }
      }

      TBOX blob_box = blob->bounding_box();
      int32_t blob_size = blob_box.width();
      if (blob_size <= blob_box.height())
        blob_size = blob_box.height();

      if (blob_size >= dot_limit && blob_size < tall_limit) {
        // Mid-sized blob: normal if it has few enough transitions.
        const int32_t trans_threshold = blob_size / textord_noise_sizefraction;
        const int32_t trans_count = blob->count_transitions(trans_threshold);
        if (trans_count < textord_noise_translimit)
          ++norm_count;
      } else if (blob_box.height() > tall_limit
                 && (!word_it.at_first() || !blob_it.at_first())) {
        // Very tall blob that is not the first blob of the first word.
        dot_count += 2;
      }
    }

    int8_t grade = kWordFine;
    if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
      if (dot_count > norm_count * textord_noise_normratio * 2)
        grade = kWordDud;
      else if (dot_count > norm_count * textord_noise_normratio)
        grade = kWordSuspect;
    }
    word_dud[word_index] = grade;

    if (grade == kWordDud)
      ++dud_words;
    else
      ++ok_words;
    ++word_index;
  }

  // Pass 2: clean the duds, plus the suspects when duds are in the majority.
  word_index = 0;
  for (word_it.mark_cycle_pt(); !word_it.cycled_list();
       word_it.forward(), ++word_index) {
    const int8_t grade = word_dud[word_index];
    const bool needs_cleaning =
        grade == kWordDud || (grade == kWordSuspect && dud_words > ok_words);
    if (!needs_cleaning)
      continue;

    // Previously we threw away the entire word.
    // Now just aggressively throw all small blobs into the reject list, where
    // the classifier can decide whether they are actually needed.
    word = nullptr, word_it.data()->CleanNoise(textord_noise_sizelimit *
                                               row->x_height());
  }
}