Esempio n. 1
0
INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD_IT word_it;
  WERD *word;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  BOX new_word_box;
  float word_x_centre;
  float baseline;
  INT16 error_count = 0;         //number of chars lost

  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      for (blob_it.mark_cycle_pt ();
      !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
          !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            if (goutline_bounding_box (outline, polyg).
            major_overlap (box)) {
              if (strlen (word->text ()) > 0) {
                if (error_count == 0) {
                  error_count = 1;
                  if (applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                      boxfile_charno,
                      box, ch,
                      "FAILURE! box overlaps blob in labelled word");
                }
                if (applybox_debug > 4)
                  tprintf
                    ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
                    block_id, row_id,
                    word_it.data ()->text ());
                word_it.data ()->set_text ("");
                //UN label it
                error_count++;
              }

              if (error_count == 0) {
                if (new_word == NULL) {
                                 /* Make a new word with a single blob */
                  new_word = word->shallow_copy ();
                  new_word->set_text (ch);
                  if (polyg)
                    new_blob = new PBLOB;
                  else
                    new_blob = (PBLOB *) new C_BLOB;
                  new_blob_it.set_to_list (new_word->
                    gblob_list ());
                  new_blob_it.add_to_end (new_blob);
                  new_outline_it.
                    set_to_list (gblob_out_list
                    (new_blob, polyg));
                }
                new_outline_it.add_to_end (outline_it.
                  extract ());
                //move blob
              }
            }
          }
                                 //no outlines in blob
          if (outline_it.empty ())
                                 //so delete blob
            delete blob_it.extract ();
        }
      }
      if (blob_it.empty ())      //no blobs in word
                                 //so delete word
          delete word_it.extract ();
    }
  }
  if (error_count > 0)
    return error_count;

  if (new_word != NULL) {
    gblob_sort_list (new_word->gblob_list (), polyg);
    word_it.add_to_end (new_word);
    new_word_box = new_word->bounding_box ();
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

    if (STRING (chs_caps_ht).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd top char below xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_x_ht).contains (ch[0]) &&
      ((new_word_box.top () >
      baseline + (1 + applybox_error_band) * row->x_height ()) ||
      (new_word_box.top () <
    baseline + (1 - applybox_error_band) * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! x-ht char didn't have top near xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
      ((new_word_box.bottom () <
      baseline - applybox_error_band * row->x_height ()) ||
      (new_word_box.bottom () >
    baseline + applybox_error_band * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! non ambig BL char didnt have bottom near baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_bot).contains (ch[0]) &&
      (new_word_box.bottom () >
    baseline + applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd bottom char above baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_desc).contains (ch[0]) &&
      (new_word_box.bottom () >
    baseline - applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Descender doesn't descend");
      new_word->set_text ("");
      return 1;
    }
    return 0;
  }
  else {
    report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
      "FAILURE! Couldn't find any blobs");
    return 1;
  }
}
Esempio n. 2
0
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD* new_word = NULL;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK* block = b_it.data();
    if (!box.major_overlap(block->bounding_box()))
      continue;
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW* row = r_it.data();
      if (!box.major_overlap(row->bounding_box()))
        continue;
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != NULL && word->text()[0] != '\0')
          continue;  // Ignore words that are already done.
        if (!box.major_overlap(word->bounding_box()))
          continue;
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
          double current_box_miss_metric = BoxMissMetric(blob_box, box);
          double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    current_box_miss_metric, next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric)
            continue;  // Blob is a better match for next box.
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
  return new_word != NULL;
}