Пример #1
0
// Reads the next box from box_file and stores its coordinates in *bbox.
// applybox_page, line_number, box_file and label are forwarded unchanged to
// read_next_box. Returns true when read_next_box succeeded, in which case
// *bbox has been set from the four coordinates it produced; returns false
// otherwise, leaving *bbox untouched.
bool read_b(int applybox_page, int *line_number, FILE *box_file,
            char *label, TBOX *bbox) {
  int xlo, ylo, xhi, yhi;
  const bool got_box = read_next_box(applybox_page, line_number, box_file,
                                     label, &xlo, &ylo, &xhi, &yhi);
  if (!got_box)
    return false;
  bbox->set_to_given_coords(xlo, ylo, xhi, yhi);
  return true;
}
Пример #2
0
// Applies the boxes listed in "<imagefile>.box" to the blobs in block_list.
//
// The box file name is built from the externally defined `imagefile` with
// ".box" appended. If the file cannot be opened, the failure is reported
// through CANTOPENFILE.error with EXIT.
//
// For every box read from the file the function:
//   * counts the box and, when its label is a 7-bit character, bumps the
//     per-character sample count;
//   * looks up the row containing the box with find_row_of_box;
//   * tracks an approximate (line, char) position within the box file: a box
//     whose left edge lies left of the previous box's right edge is treated
//     as the start of a new line;
//   * reports a failure if no row was found, otherwise warns about an
//     apparent false row break and calls resegment_box.
// Finally tidy_up is called and a summary is printed with tprintf.
//
// block_list: the real blocks to be labelled.
void apply_boxes(BLOCK_LIST *block_list) {
  INT16 boxfile_lineno = 0;
  INT16 boxfile_charno = 0;
  BOX box;                       // box read from the box file
  char ch[2];                    // correct character from the box file
  ROW *row;
  ROW *prev_row = NULL;
  // Starting at MAX_INT16 makes the first box begin line 1 (see loop below).
  INT16 prev_box_right = MAX_INT16;
  INT16 block_id;
  INT16 row_id;
  INT16 box_count = 0;
  INT16 box_failures = 0;
  // The following are not initialised here; presumably they are outputs
  // filled in by tidy_up -- TODO confirm against its declaration.
  INT16 labels_ok;
  INT16 rows_ok;
  INT16 bad_blobs;
  INT16 tgt_char_counts[128];    // number of box samples per character
  INT16 i;
  INT16 rebalance_count = 0;
  char min_char;
  INT16 min_samples;
  INT16 final_labelled_blob_count;

  for (i = 0; i < 128; i++)
    tgt_char_counts[i] = 0;

  FILE* box_file;
  STRING filename = imagefile;
  filename += ".box";
  if (!(box_file = fopen (filename.string(), "r"))) {
    CANTOPENFILE.error ("read_next_box", EXIT,
      "Cant open box file %s %d",
      filename.string(), errno);
  }

  ch[1] = '\0';
  clear_any_old_text(block_list);
  // NOTE(review): box_file is never closed in this function; presumably
  // read_next_box closes it when it reaches end of file -- confirm.
  while (read_next_box (box_file, &box, &ch[0])) {
    box_count++;
    // Plain char may be signed, so a byte >= 0x80 would otherwise produce a
    // negative (out-of-bounds) index. Only characters that fit the 128-entry
    // table are counted.
    const unsigned char label_index = (unsigned char) ch[0];
    if (label_index < 128)
      tgt_char_counts[label_index]++;
    row = find_row_of_box (block_list, box, block_id, row_id);
    if (box.left () < prev_box_right) {
      // Box starts left of where the previous one ended: new box-file line.
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else
      boxfile_charno++;

    if (row == NULL) {
      box_failures++;
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      // Same box-file line as the previous box but a different row found.
      if ((box.left () >= prev_box_right) && (row != prev_row))
        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
          "WARNING! false row break");
      box_failures += resegment_box (row, box, ch, block_id, row_id,
        boxfile_lineno, boxfile_charno);
      prev_row = row;
    }
    prev_box_right = box.right ();
  }
  tidy_up(block_list,
          labels_ok,
          rows_ok,
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
          min_char,
          min_samples,
          final_labelled_blob_count);
  tprintf ("APPLY_BOXES:\n");
  tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
  tprintf ("   Initially labelled blobs: %6d in %d rows\n",
    labels_ok, rows_ok);
  tprintf ("   Box failures detected:		%6d\n", box_failures);
  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
  tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
  tprintf ("				Total unlabelled words:   %6d\n",
    bad_blobs);
  tprintf ("				Final labelled words:     %6d\n",
    final_labelled_blob_count);
}
Пример #3
0
// Box files are used ONLY DURING TRAINING, by both the process that creates
// tr files with tesseract and by unicharset_extractor. read_next_box factors
// out the interpretation of one box-file line so that applybox and
// unicharset_extractor parse box files identically.
//
// Returns true together with the next valid box's utf8 string and
// coordinates, or false on eof (in which case the file is closed).
// The utf8 file signature is ignored, the text is checked for valid utf-8,
// and fields may be separated by either space or tab.
// utf8_str must be at least kBoxReadBufSize in length.
// If there are page numbers in the file, it reads them all.
//
// This overload simply forwards to the overload that takes a leading page
// argument, supplying -1 for it.
bool read_next_box(int *line_number, FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  const int page_arg = -1;
  return read_next_box(page_arg, line_number, box_file, utf8_str,
                       x_min, y_min, x_max, y_max);
}
Пример #4
0
// Convenience overload of read_next_box without a leading page argument.
// Forwards box_file, utf8_str and the four coordinate output pointers
// unchanged to the overload that takes a leading integer, passing -1 for it,
// and returns that overload's result.
// NOTE(review): -1 presumably selects "no specific page" -- confirm against
// the forwarded-to overload.
bool read_next_box(FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  const int page_arg = -1;
  return read_next_box(page_arg, box_file, utf8_str,
                       x_min, y_min, x_max, y_max);
}