Пример #1
0
/**
 * apply_box_training()
 *
 * Walks every block, row and word of block_list. For each word whose label is
 * exactly one character long and which holds exactly one blob, a copy of the
 * word is made with make_bln_copy() and its blob is handed to
 * tess_training_tester() as a correct sample for that character.
 *
 * The number of blobs submitted is printed when the walk is finished.
 *
 * @param block_list  blocks whose labelled words are used for training
 */
void apply_box_training(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;
  PBLOB_IT blob_it;
  DENORM denorm;                 // filled in by make_bln_copy()
  INT16 count = 0;               // blobs submitted for training
  char ch[2];                    // one-character label, NUL terminated

  ch[1] = '\0';

  tprintf ("Generating training data\n");
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      ROW *row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        WERD *word = word_it.data ();
        // Only a single-char label on a single blob is usable for training.
        if ((strlen (word->text ()) != 1) ||
            (word->gblob_list ()->length () != 1))
          continue;

        WERD *bln_word =
          make_bln_copy (word, row, row->x_height (), &denorm);
        blob_it.set_to_list (bln_word->blob_list ());
        ch[0] = *word->text ();
        tess_training_tester (blob_it.data (),  // single blob
                              &denorm, TRUE,    // correct
                              ch,               // correct ASCII char
                              1,                // ASCII length
                              NULL);
        // The previous version also built a denormalised copy of bln_word
        // here, but nothing ever read it, so that work has been dropped.
        delete bln_word;
        count++;
      }
    }
  }
  tprintf ("Generated training data for %d blobs\n", count);
}
Пример #2
0
/// Visits every word in page_res that carries a non-empty text label and
/// asks the classifier (via FindSegmentation) for a segmentation that
/// produces that label.
/// A word is reported with tprintf and removed through
/// PAGE_RES_IT::DeleteCurrentWord when either its label cannot be converted
/// to UNICHAR_IDs or no matching segmentation is found.
/// Words with a NULL or empty label are skipped untouched.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
  PAGE_RES_IT pr_it(page_res);
  WERD_RES* word_res;
  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
    WERD* word = word_res->word;
    const bool has_label = word->text() != NULL && word->text()[0] != '\0';
    if (has_label) {
      // The label as a sequence of UNICHAR_IDs.
      GenericVector<UNICHAR_ID> target_text;
      if (!ConvertStringToUnichars(word->text(), &target_text)) {
        tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
                word->text());
        pr_it.DeleteCurrentWord();
      } else if (!FindSegmentation(target_text, word_res)) {
        tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
                word->text());
        pr_it.DeleteCurrentWord();
      }
    }
  }
}
Пример #3
0
/**
 *  word_display()  Word Processor
 *
 *  Draws one word in image_win.
 *
 *  If color_mode is anything other than CM_RAINBOW and the word has a
 *  box_word, only the per-blob boxes are drawn: RED for blobs that have the
 *  attribute selected by color_mode, GREEN for the rest. In that case the
 *  function returns false when the word has no fontinfo and true otherwise.
 *
 *  In all other cases the word's display flags select what is drawn
 *  (bounding boxes, edge steps, polygonal approximation, text and blamer
 *  annotation). If none of them produced any output the word bounding box
 *  is drawn anyway, and TRUE is returned.
 *
 *  The block and row arguments are not used here.
 */
BOOL8 Tesseract::word_display(BLOCK* block, ROW* row, WERD_RES* word_res) {
  WERD* word = word_res->word;

  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* boxes = word_res->box_word;
    const int num_boxes = boxes->length();
    if (word_res->fontinfo == NULL) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int b = 0; b < num_boxes; ++b) {
      // Does this blob have the property the current colour mode shows?
      bool highlight = false;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          highlight = (boxes->BlobPosition(b) == SP_SUBSCRIPT);
          break;
        case CM_SUPERSCRIPT:
          highlight = (boxes->BlobPosition(b) == SP_SUPERSCRIPT);
          break;
        case CM_ITALIC:
          highlight = font_info.is_italic() ? true : false;
          break;
        case CM_BOLD:
          highlight = font_info.is_bold() ? true : false;
          break;
        case CM_FIXEDPITCH:
          highlight = font_info.is_fixed_pitch() ? true : false;
          break;
        case CM_SERIF:
          highlight = font_info.is_serif() ? true : false;
          break;
        case CM_SMALLCAPS:
          highlight = word_res->small_caps ? true : false;
          break;
        case CM_DROPCAPS:
          highlight = (boxes->BlobPosition(b) == SP_DROPCAP);
          break;
          // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(highlight ? ScrollView::RED : ScrollView::GREEN);
      TBOX blob_box = boxes->BlobBox(b);
      image_win->Rectangle(blob_box.left(), blob_box.bottom(),
                           blob_box.right(), blob_box.top());
    }
    return true;
  }

  bool drew_something = false;

  // Word bounding box plus the bounding box of each contained blob.
  // The colour settings are converted through inT32 before becoming a
  // ScrollView::Color.
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(
        image_win,
        static_cast<ScrollView::Color>(
            static_cast<inT32>(editor_image_word_bb_color)),
        static_cast<ScrollView::Color>(
            static_cast<inT32>(editor_image_word_bb_color)));

    const ScrollView::Color blob_colour = static_cast<ScrollView::Color>(
        static_cast<inT32>(editor_image_blob_bb_color));
    image_win->Pen(blob_colour);
    C_BLOB_IT blob_it(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      blob_it.data()->bounding_box().plot(image_win);
    }
    drew_something = true;
  }

  // Edge steps, drawn in rainbow colors.
  if (word->display_flag(DF_EDGE_STEP)) {
    word->plot(image_win);
    drew_something = true;
  }

  // Polygonal approximation: needs a temporary TWERD conversion.
  if (word->display_flag(DF_POLYGONAL)) {
    TWERD* poly_word = TWERD::PolygonalCopy(word);
    poly_word->plot(image_win);
    delete poly_word;
    drew_something = true;
  }

  // Correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
    text = word->text();
  }
  const BlamerBundle* blamer_bundle = word_res->blamer_bundle;
  if (word->display_flag(DF_BLAMER) &&
      (blamer_bundle == NULL ||
       blamer_bundle->incorrect_result_reason != IRR_CORRECT)) {
    // The blamer line replaces any plain text: "<truth> -> <best choice>".
    text = "";
    if (blamer_bundle != NULL) {
      for (int t = 0; t < blamer_bundle->truth_text.length(); ++t) {
        text += blamer_bundle->truth_text[t];
      }
    } else {
      text += "NULL";
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice != NULL) {
      word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
    } else {
      best_choice_str = "NULL";
    }
    text += best_choice_str;
    IncorrectResultReason reason = IRR_PAGE_LAYOUT;
    if (blamer_bundle != NULL) {
      reason = blamer_bundle->incorrect_result_reason;
    }
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }

  if (text.length() > 0) {
    TBOX word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    const int word_height = word_bb.height();
    // Text is half the word height, capped at 20.
    int text_height = static_cast<int>(0.50 * word_height);
    if (text_height > 20) {
      text_height = 20;
    }
    image_win->TextAttributes("Arial", text_height, false, false, false);
    // Offset from the bottom-left corner; no horizontal shift for words
    // that are at least as tall as they are wide.
    float shift = 0.0f;
    if (word_height < word_bb.width()) {
      shift = 0.25 * word_height;
    }
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.string());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.string());
    }
    drew_something = true;
  }

  // Nothing was requested or available: show the bounding box anyway.
  if (!drew_something) {
    word->bounding_box().plot(
        image_win,
        static_cast<ScrollView::Color>(
            static_cast<inT32>(editor_image_word_bb_color)),
        static_cast<ScrollView::Color>(
            static_cast<inT32>(editor_image_word_bb_color)));
  }
  return TRUE;
}
Пример #4
0
/**
 * apply_box_testing()
 *
 * Walks every block, row and word of block_list. Each word that has a
 * one-character label not listed in applybox_test_exclusions and exactly one
 * blob is copied with make_bln_copy() and run through tess_segment_pass1().
 * The result is classed as rejected (empty or all-blank string), wrong, or
 * correct; failures are printed as "row:word" lines.
 *
 * Unless SECURE_NAMES is defined, every result is also fed to a WERDSTATS
 * object and a summary is printed at the end.
 *
 * @param block_list  blocks whose labelled words are to be tested
 */
void apply_box_testing(BLOCK_LIST *block_list) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;
  PBLOB_IT blob_it;
  DENORM denorm;
                                 // detailed results
  BLOB_CHOICE_LIST_CLIST blob_choices;
#ifndef SECURE_NAMES
  WERDSTATS wordstats;           // As from newdiff
#endif
  WERD *outword;                 // bln best choice segmentation
  WERD_CHOICE *raw_choice;       // top choice permuter
  INT16 row_count = 0;
  INT16 word_count = 0;
  INT16 count = 0;
  INT16 char_count = 0;
  INT16 correct_count = 0;
  INT16 err_count = 0;
  INT16 rej_count = 0;
  char ch[2] = { '\0', '\0' };   // expected label, NUL terminated
  char tess_rej_str[3] = "|A";   // stats label used for rejected words
  char tess_long_str[3] = "|B";  // stats label used for results > 2 chars

  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      ROW *row = row_it.data ();
      row_count++;
      word_count = 0;
      word_it.set_to_list (row->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        WERD *word = word_it.data ();
        word_count++;
        // Test only single-char, non-excluded labels on single-blob words.
        if ((strlen (word->text ()) != 1) ||
            STRING (applybox_test_exclusions).contains (*word->text ()) ||
            (word->gblob_list ()->length () != 1))
          continue;

        WERD *bln_word =
          make_bln_copy (word, row, row->x_height (), &denorm);
        blob_it.set_to_list (bln_word->blob_list ());
        ch[0] = *word->text ();
        char_count++;
        WERD_CHOICE *best_choice =   // tess output
          tess_segment_pass1 (bln_word, &denorm, tess_default_matcher,
                              raw_choice, &blob_choices, outword);

        /*
          A TESS failure on the word shows up as a zero length string or a
          string made up entirely of blanks.
        */
        const bool tess_failed =
          (best_choice->string ().length () == 0) ||
          (strspn (best_choice->string ().string (), " ") ==
           best_choice->string ().length ());

        if (tess_failed) {
          rej_count++;
          tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
                   row_count, word_count, ch);
#ifndef SECURE_NAMES
          wordstats.word (tess_rej_str, 2, ch, 1);
#endif
        }
        else {
          // The string, the outword blobs and the choice list must all
          // have the same length; print the details before asserting.
          const bool length_mismatch =
            (best_choice->string ().length () !=
             outword->blob_list ()->length ()) ||
            (best_choice->string ().length () != blob_choices.length ());
          if (length_mismatch) {
            tprintf
              ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
               best_choice->string ().string (),
               best_choice->string ().length (),
               outword->blob_list ()->length (),
               blob_choices.length ());
          }
          ASSERT_HOST (best_choice->string ().length () ==
                       outword->blob_list ()->length ());
          ASSERT_HOST (best_choice->string ().length () ==
                       blob_choices.length ());
          fix_quotes ((char *) best_choice->string ().string (),
                                 // turn to double
                      outword, &blob_choices);
          if (strcmp (best_choice->string ().string (), ch) == 0) {
            correct_count++;
          }
          else {
            err_count++;
            tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
                     row_count, word_count, ch,
                     best_choice->string ().string ());
          }
#ifndef SECURE_NAMES
          if (best_choice->string ().length () > 2)
            wordstats.word (tess_long_str, 2, ch, 1);
          else
            wordstats.word ((char *) best_choice->string ().string (),
                            best_choice->string ().length (), ch, 1);
#endif
        }

        // Release everything produced for this word.
        delete bln_word;
        delete outword;
        delete best_choice;
        delete raw_choice;
        blob_choices.deep_clear ();
        count++;
      }
    }
  }
#ifndef SECURE_NAMES
  wordstats.print (1, 100.0);
  wordstats.conf_matrix ();
  tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
           char_count, correct_count, rej_count, err_count);
#endif
}
Пример #5
0
/*************************************************************************
 * tidy_up()
 *   - report >1 block
 *   - sort the words in each row.
 *   - report any rows with no labelled words.
 *   - report any remaining unlabelled words
 *   - report total labelled words
 *   - when applybox_rebalance is set, duplicate labelled words until each
 *     character with more than one sample reaches its target count
 *
 * block_list                 blocks to tidy (word lists are sorted and may
 *                            gain duplicated words)
 * ok_char_count              out: number of labelled words
 * ok_row_count               out: number of rows holding a labelled word
 * unlabelled_words           out: number of words with an empty label
 * tgt_char_counts            in: target sample count per character code;
 *                            indexed 0..127 here
 * rebalance_count            in/out: incremented once per duplicated word
 *                            (not reset here)
 * min_char, min_samples      out: the character with a positive target that
 *                            has the fewest labelled samples, and that
 *                            count; min_char is left untouched if no
 *                            character qualifies
 * final_labelled_blob_count  out: words with a one-char label and one blob
 *
 *************************************************************************/
void tidy_up(                         //
             BLOCK_LIST *block_list,  //real blocks
             INT16 &ok_char_count,
             INT16 &ok_row_count,
             INT16 &unlabelled_words,
             INT16 *tgt_char_counts,
             INT16 &rebalance_count,
             char &min_char,
             INT16 &min_samples,
             INT16 &final_labelled_blob_count) {
  const int kNumCharCodes = 128;  // size of the per-character tables

  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  ROW *row;
  WERD_IT word_it;
  WERD *word;
  WERD *duplicate_word;
  INT16 block_idx = 0;
  INT16 row_idx;
  INT16 all_row_idx = 0;
  BOOL8 row_ok;
  BOOL8 rebalance_needed = FALSE;
                                 //No. of unique labelled samples
  INT16 labelled_char_counts[kNumCharCodes];
  INT16 i;
  char ch;
  char prev_ch = '\0';
  BOOL8 at_dupe_of_prev_word;
  ROW *prev_row = NULL;
  INT16 left;
  INT16 prev_left = -1;

  for (i = 0; i < kNumCharCodes; i++)
    labelled_char_counts[i] = 0;

  ok_char_count = 0;
  ok_row_count = 0;
  unlabelled_words = 0;
  if ((applybox_debug > 4) && (block_it.length () != 1))
    tprintf ("APPLY_BOXES: More than one block??\n");

  /* Pass 1: sort each row and count labelled / unlabelled words */
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    block_idx++;
    row_idx = 0;
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row_idx++;
      all_row_idx++;
      // Reset per row: a labelled word in an earlier row of the block must
      // not make this row look OK.
      row_ok = FALSE;
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if (strlen (word->text ()) == 0) {
          unlabelled_words++;
          if (applybox_debug > 4) {
            tprintf
              ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
               block_idx, row_idx, all_row_idx);
          }
        }
        else {
          if (word->gblob_list ()->length () != 1)
            tprintf
              ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
               block_idx, row_idx, all_row_idx);

          ok_char_count++;
          // Plain char may be signed; go through unsigned char and stay
          // inside the table.
          const unsigned char label =
            static_cast<unsigned char>(*word->text ());
          if (label < kNumCharCodes)
            labelled_char_counts[label]++;
          row_ok = TRUE;
        }
      }
      // Count the row only if it really has a labelled word; the report is
      // the only part that depends on the debug level.
      if (row_ok)
        ok_row_count++;
      else if (applybox_debug > 4)
        tprintf
          ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
           block_idx, row_idx, all_row_idx);
    }
  }

  /* Compare the counts with the targets and find the scarcest character */
  min_samples = 9999;
  for (i = 0; i < kNumCharCodes; i++) {
    if (tgt_char_counts[i] > labelled_char_counts[i]) {
      if (labelled_char_counts[i] <= 1) {
        tprintf
          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
           labelled_char_counts[i], (char) i, tgt_char_counts[i]);
      }
      else {
        rebalance_needed = TRUE;
        if (applybox_debug > 0)
          tprintf
            ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
             (char) i, tgt_char_counts[i], labelled_char_counts[i]);
      }
    }
    if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
      min_samples = labelled_char_counts[i];
      min_char = (char) i;
    }
  }

  /* Pass 2 (repeated): duplicate words until the targets are met */
  while (applybox_rebalance && rebalance_needed) {
    block_it.set_to_list (block_list);
    for (block_it.mark_cycle_pt ();
         !block_it.cycled_list (); block_it.forward ()) {
      row_it.set_to_list (block_it.data ()->row_list ());
      for (row_it.mark_cycle_pt ();
           !row_it.cycled_list (); row_it.forward ()) {
        row = row_it.data ();
        word_it.set_to_list (row->word_list ());
        for (word_it.mark_cycle_pt ();
             !word_it.cycled_list (); word_it.forward ()) {
          word = word_it.data ();
          left = word->bounding_box ().left ();
          ch = *word->text ();
          const unsigned char ch_idx = static_cast<unsigned char>(ch);
          // A word is a duplicate of its predecessor when row, left edge
          // and label all match. (This used to ASSIGN prev_left to left,
          // which disabled the position test.)
          at_dupe_of_prev_word = ((row == prev_row) &&
                                  (left == prev_left) &&
                                  (ch == prev_ch));
          if ((ch != '\0') &&
              (ch_idx < kNumCharCodes) &&
              (labelled_char_counts[ch_idx] > 1) &&
              (tgt_char_counts[ch_idx] > labelled_char_counts[ch_idx]) &&
              (!at_dupe_of_prev_word)) {
            /* Duplicate the word to rebalance the labelled samples */
            if (applybox_debug > 9) {
              tprintf ("Duping \"%c\" from ", ch);
              word->bounding_box ().print ();
            }
            duplicate_word = new WERD;
            *duplicate_word = *word;
            word_it.add_after_then_move (duplicate_word);
            rebalance_count++;
            labelled_char_counts[ch_idx]++;
          }
          prev_row = row;
          prev_left = left;
          prev_ch = ch;
        }
      }
    }
    // Go round again only if some character that can be duplicated is
    // still short of its target.
    rebalance_needed = FALSE;
    for (i = 0; i < kNumCharCodes; i++) {
      if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
          (labelled_char_counts[i] > 1)) {
        rebalance_needed = TRUE;
        break;
      }
    }
  }

  /* Now final check - count labelled blobs */
  final_labelled_blob_count = 0;
  block_it.set_to_list (block_list);
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      row = row_it.data ();
      word_it.set_to_list (row->word_list ());
      word_it.sort (word_comparator);
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word = word_it.data ();
        if ((strlen (word->text ()) == 1) &&
            (word->gblob_list ()->length () == 1))
          final_labelled_blob_count++;
      }
    }
  }
}
Пример #6
0
/**
 * resegment_box()
 *
 * Collects every outline in row whose bounding box has a major overlap with
 * box into one new single-blob word labelled ch, and appends that word to
 * the row. Blobs and words emptied by this are deleted.
 *
 * If one of the overlapping outlines belongs to a word that is already
 * labelled, that word is un-labelled, nothing is added to the row and the
 * error count is returned.
 *
 * After a new word has been added, its top and bottom are checked against
 * the row baseline and x-height for the character classes chs_caps_ht,
 * chs_odd_top, chs_x_ht, chs_non_ambig_bl, chs_odd_bot and chs_desc; on a
 * failed check the new word stays in the row with an empty label.
 *
 * @param row             row to search
 * @param box             box from the box file
 * @param ch              label for the box; ch[0] selects the size checks
 * @param block_id        block number, used in debug output only
 * @param row_id          row number, used in debug output only
 * @param boxfile_lineno  box file line, used in failure reports
 * @param boxfile_charno  box file char index, used in failure reports
 * @return 0 on success, otherwise the number of errors (1 for a failed
 *         size check or when no blobs were found)
 */
INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD_IT word_it;
  WERD *word;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  BOX new_word_box;
  float word_x_centre;
  float baseline;
  INT16 error_count = 0;         //number of chars lost

  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      for (blob_it.mark_cycle_pt ();
           !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
               !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            if (goutline_bounding_box (outline, polyg).
                major_overlap (box)) {
              if (strlen (word->text ()) > 0) {
                // The box hits a word that already has a label: report the
                // box once, then un-label every such word.
                if (error_count == 0) {
                  error_count = 1;
                  if (applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                                       boxfile_charno,
                                       box, ch,
                                       "FAILURE! box overlaps blob in labelled word");
                }
                if (applybox_debug > 4)
                  tprintf
                    ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
                     block_id, row_id,
                     word_it.data ()->text ());
                word_it.data ()->set_text ("");
                //UN label it
                error_count++;
              }

              if (error_count == 0) {
                if (new_word == NULL) {
                                 /* Make a new word with a single blob */
                  new_word = word->shallow_copy ();
                  new_word->set_text (ch);
                  if (polyg)
                    new_blob = new PBLOB;
                  else
                    new_blob = (PBLOB *) new C_BLOB;
                  new_blob_it.set_to_list (new_word->
                                           gblob_list ());
                  new_blob_it.add_to_end (new_blob);
                  new_outline_it.
                    set_to_list (gblob_out_list
                                 (new_blob, polyg));
                }
                // Move the outline from its old blob into the new one.
                new_outline_it.add_to_end (outline_it.
                                           extract ());
              }
            }
          }
                                 //no outlines in blob
          if (outline_it.empty ())
                                 //so delete blob
            delete blob_it.extract ();
        }
      }
      if (blob_it.empty ())      //no blobs in word
                                 //so delete word
        delete word_it.extract ();
    }
  }
  if (error_count > 0) {
    // new_word is only attached to the row further down, so at this point
    // nothing else refers to it. Free it instead of leaking it
    // (deleting NULL is a no-op).
    delete new_word;
    return error_count;
  }

  if (new_word != NULL) {
    // NOTE(review): polyg holds the flag of the last word visited by the
    // loop above, not necessarily that of the word new_word was copied
    // from - confirm the two cannot differ within one row.
    gblob_sort_list (new_word->gblob_list (), polyg);
    word_it.add_to_end (new_word);
    new_word_box = new_word->bounding_box ();
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

    // Size checks: the first one that fails un-labels the new word.
    if (STRING (chs_caps_ht).contains (ch[0]) &&
        (new_word_box.top () <
         baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (ch[0]) &&
        (new_word_box.top () <
         baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! Odd top char below xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_x_ht).contains (ch[0]) &&
        ((new_word_box.top () >
          baseline + (1 + applybox_error_band) * row->x_height ()) ||
         (new_word_box.top () <
          baseline + (1 - applybox_error_band) * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! x-ht char didn't have top near xht");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
        ((new_word_box.bottom () <
          baseline - applybox_error_band * row->x_height ()) ||
         (new_word_box.bottom () >
          baseline + applybox_error_band * row->x_height ()))) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! non ambig BL char didnt have bottom near baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_bot).contains (ch[0]) &&
        (new_word_box.bottom () >
         baseline + applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! Odd bottom char above baseline");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_desc).contains (ch[0]) &&
        (new_word_box.bottom () >
         baseline - applybox_error_band * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                         "FAILURE! Descender doesn't descend");
      new_word->set_text ("");
      return 1;
    }
    return 0;
  }
  else {
    report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
                       "FAILURE! Couldn't find any blobs");
    return 1;
  }
}
Пример #7
0
/// Moves every blob that has a major overlap with box, out of words that
/// are not yet labelled, into a single new word labelled correct_text.
/// Only blocks, rows and words that themselves have a major overlap with
/// box are searched. A blob is left where it is when BoxMissMetric rates
/// it a better match for next_box than for box.
/// The new word is created from the first word that yields a blob and is
/// appended to that word's row.
/// @return true if at least one blob was claimed, false if no blob could
/// be found for the box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  const bool trace = applybox_debug > 2;
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }

  WERD* new_word = NULL;
  BLOCK_IT block_it(block_list);
  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
       block_it.forward()) {
    BLOCK* block = block_it.data();
    if (!box.major_overlap(block->bounding_box())) {
      continue;
    }
    ROW_IT row_it(block->row_list());
    for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
      ROW* row = row_it.data();
      if (!box.major_overlap(row->bounding_box())) {
        continue;
      }
      WERD_IT word_it(row->word_list());
      for (word_it.mark_cycle_pt(); !word_it.cycled_list();
           word_it.forward()) {
        WERD* word = word_it.data();
        if (trace) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        // Skip words that already have a label, and words away from box.
        const bool already_done =
            word->text() != NULL && word->text()[0] != '\0';
        if (already_done || !box.major_overlap(word->bounding_box())) {
          continue;
        }
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box)) {
            continue;
          }
          const double miss_here = BoxMissMetric(blob_box, box);
          const double miss_next = BoxMissMetric(blob_box, next_box);
          if (trace) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    miss_here, miss_next);
          }
          if (miss_here > miss_next) {
            continue;  // The next box has the better claim on this blob.
          }
          if (trace) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // First claimed blob: create the receiving word in this row.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            word_it.add_to_end(new_word);
          }
          C_BLOB_IT dest_it(new_word->cblob_list());
          dest_it.add_to_end(blob_it.extract());
        }
      }
    }
  }

  const bool found = (new_word != NULL);
  if (!found && applybox_debug > 0) {
    tprintf("FAIL!\n");
  }
  return found;
}