// Reads the next box from the given box file into TBOX. bool read_b(int applybox_page, int *line_number, FILE *box_file, char *label, TBOX *bbox) { int x_min, y_min, x_max, y_max; if (read_next_box(applybox_page, line_number, box_file, label, &x_min, &y_min, &x_max, &y_max)) { bbox->set_to_given_coords(x_min, y_min, x_max, y_max); return true; } else { return false; } }
void apply_boxes(BLOCK_LIST *block_list //real blocks ) { INT16 boxfile_lineno = 0; INT16 boxfile_charno = 0; BOX box; //boxfile box char ch[2]; //correct ch from boxfile ROW *row; ROW *prev_row = NULL; INT16 prev_box_right = MAX_INT16; INT16 block_id; INT16 row_id; INT16 box_count = 0; INT16 box_failures = 0; INT16 labels_ok; INT16 rows_ok; INT16 bad_blobs; INT16 tgt_char_counts[128]; //No. of box samples // INT16 labelled_char_counts[128]; //No. of unique labelled samples INT16 i; INT16 rebalance_count = 0; char min_char; INT16 min_samples; INT16 final_labelled_blob_count; for (i = 0; i < 128; i++) tgt_char_counts[i] = 0; FILE* box_file; STRING filename = imagefile; filename += ".box"; if (!(box_file = fopen (filename.string(), "r"))) { CANTOPENFILE.error ("read_next_box", EXIT, "Cant open box file %s %d", filename.string(), errno); } ch[1] = '\0'; clear_any_old_text(block_list); while (read_next_box (box_file, &box, &ch[0])) { box_count++; tgt_char_counts[ch[0]]++; row = find_row_of_box (block_list, box, block_id, row_id); if (box.left () < prev_box_right) { boxfile_lineno++; boxfile_charno = 1; } else boxfile_charno++; if (row == NULL) { box_failures++; report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! box overlaps no blobs or blobs in multiple rows"); } else { if ((box.left () >= prev_box_right) && (row != prev_row)) report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "WARNING! 
false row break"); box_failures += resegment_box (row, box, ch, block_id, row_id, boxfile_lineno, boxfile_charno); prev_row = row; } prev_box_right = box.right (); } tidy_up(block_list, labels_ok, rows_ok, bad_blobs, tgt_char_counts, rebalance_count, min_char, min_samples, final_labelled_blob_count); tprintf ("APPLY_BOXES:\n"); tprintf (" Boxes read from boxfile: %6d\n", box_count); tprintf (" Initially labelled blobs: %6d in %d rows\n", labels_ok, rows_ok); tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples); tprintf (" Total unlabelled words: %6d\n", bad_blobs); tprintf (" Final labelled words: %6d\n", final_labelled_blob_count); }
// Box files are used ONLY DURING TRAINING, but by both processes of // creating tr files with tesseract, and unicharset_extractor. // read_next_box factors out the code to interpret a line of a box // file so that applybox and unicharset_extractor interpret the same way. // This function returns the next valid box file utf8 string and coords // and returns true, or false on eof (and closes the file). // It ignores the uft8 file signature, checks for valid utf-8 and allows // space or tab between fields. // utf8_str must be at least kBoxReadBufSize in length. // If there are page numbers in the file, it reads them all. bool read_next_box(int *line_number, FILE* box_file, char* utf8_str, int* x_min, int* y_min, int* x_max, int* y_max) { return read_next_box(-1, line_number, box_file, utf8_str, x_min, y_min, x_max, y_max); }
// Convenience overload: forwards box_file, utf8_str and the four coordinate
// outputs to the read_next_box overload that takes a leading int argument,
// and returns that overload's result unchanged.
bool read_next_box(FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  // -1 goes in the leading int slot (presumably the page selector, meaning
  // "no particular page" -- confirm against the callee).
  const int page = -1;
  return read_next_box(page, box_file, utf8_str,
                       x_min, y_min, x_max, y_max);
}