示例#1
0
// Applies the box file based on the image name fname, and resegments
// the words in the block_list (page), with:
// blob-mode: one blob per line in the box file, words as input.
// word/line-mode: one blob per space-delimited unit after the #, and one word
// per line in the box file. (See comment above for box file format.)
// If find_segmentation is true, (word/line mode) then the classifier is used
// to re-segment words/lines to match the space-delimited truth string for
// each box. In this case, the input box may be for a word or even a whole
// text line, and the output words will contain multiple blobs corresponding
// to the space-delimited input string.
// With find_segmentation false, no classifier is needed, but the chopper
// can still be used to correctly segment touching characters with the help
// of the input boxes.
// In the returned PAGE_RES, the WERD_RES are setup as they would be returned
// from normal classification, ie. with a word, chopped_word, rebuild_word,
// seam_array, denorm, box_word, and best_state, but NO best_choice or
// raw_choice, as they would require a UNICHARSET, which we aim to avoid.
// Instead, the correct_text member of WERD_RES is set, and this may be later
// converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
// is not required before calling ApplyBoxTraining.
PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
                                bool find_segmentation,
                                BLOCK_LIST *block_list) {
  GenericVector<TBOX> boxes;
  GenericVector<STRING> texts, full_texts;
  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
                    NULL)) {
    return NULL;  // Can't do it.
  }

  int box_count = boxes.size();
  int box_failures = 0;
  // Add an empty everything to the end.
  boxes.push_back(TBOX());
  texts.push_back(STRING());
  full_texts.push_back(STRING());

  // In word mode, we use the boxes to make a word for each box, but
  // in blob mode we use the existing words and maximally chop them first.
  PAGE_RES* page_res = find_segmentation ?
      NULL : SetupApplyBoxes(boxes, block_list);
  clear_any_old_text(block_list);

  for (int i = 0; i < boxes.size() - 1; i++) {
    bool foundit = false;
    if (page_res != NULL) {
      if (i == 0) {
        foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
                                   full_texts[i].string());
      } else {
        foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
                                   boxes[i + 1], full_texts[i].string());
      }
    } else {
      foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                 texts[i].string());
    }
    if (!foundit) {
      box_failures++;
      ReportFailedBox(i, boxes[i], texts[i].string(),
                      "FAILURE! Couldn't find a matching blob");
    }
  }

  if (page_res == NULL) {
    // In word/line mode, we now maximally chop all the words and resegment
    // them with the classifier.
    page_res = SetupApplyBoxes(boxes, block_list);
    ReSegmentByClassification(page_res);
  }
  if (applybox_debug > 0) {
    tprintf("APPLY_BOXES:\n");
    tprintf("   Boxes read from boxfile:  %6d\n", box_count);
    if (box_failures > 0)
      tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
  }
  TidyUp(page_res);
  return page_res;
}
示例#2
0
void apply_boxes(BLOCK_LIST *block_list    //real blocks
                ) {
  INT16 boxfile_lineno = 0;
  INT16 boxfile_charno = 0;
  BOX box;                       //boxfile box
  char ch[2];                    //correct ch from boxfile
  ROW *row;
  ROW *prev_row = NULL;
  INT16 prev_box_right = MAX_INT16;
  INT16 block_id;
  INT16 row_id;
  INT16 box_count = 0;
  INT16 box_failures = 0;
  INT16 labels_ok;
  INT16 rows_ok;
  INT16 bad_blobs;
  INT16 tgt_char_counts[128];    //No. of box samples
  //      INT16                                   labelled_char_counts[128];      //No. of unique labelled samples
  INT16 i;
  INT16 rebalance_count = 0;
  char min_char;
  INT16 min_samples;
  INT16 final_labelled_blob_count;

  for (i = 0; i < 128; i++)
    tgt_char_counts[i] = 0;

  FILE* box_file;
  STRING filename = imagefile;
  filename += ".box";
  if (!(box_file = fopen (filename.string(), "r"))) {
    CANTOPENFILE.error ("read_next_box", EXIT,
      "Cant open box file %s %d",
      filename.string(), errno);
  }

  ch[1] = '\0';
  clear_any_old_text(block_list);
  while (read_next_box (box_file, &box, &ch[0])) {
    box_count++;
    tgt_char_counts[ch[0]]++;
    row = find_row_of_box (block_list, box, block_id, row_id);
    if (box.left () < prev_box_right) {
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else
      boxfile_charno++;

    if (row == NULL) {
      box_failures++;
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      if ((box.left () >= prev_box_right) && (row != prev_row))
        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
          "WARNING! false row break");
      box_failures += resegment_box (row, box, ch, block_id, row_id,
        boxfile_lineno, boxfile_charno);
      prev_row = row;
    }
    prev_box_right = box.right ();
  }
  tidy_up(block_list,
          labels_ok,
          rows_ok,
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
          min_char,
          min_samples,
          final_labelled_blob_count);
  tprintf ("APPLY_BOXES:\n");
  tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
  tprintf ("   Initially labelled blobs: %6d in %d rows\n",
    labels_ok, rows_ok);
  tprintf ("   Box failures detected:		%6d\n", box_failures);
  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
  tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
  tprintf ("				Total unlabelled words:   %6d\n",
    bad_blobs);
  tprintf ("				Final labelled words:     %6d\n",
    final_labelled_blob_count);
}