// Applies the box file based on the image name fname, and resegments // the words in the block_list (page), with: // blob-mode: one blob per line in the box file, words as input. // word/line-mode: one blob per space-delimited unit after the #, and one word // per line in the box file. (See comment above for box file format.) // If find_segmentation is true, (word/line mode) then the classifier is used // to re-segment words/lines to match the space-delimited truth string for // each box. In this case, the input box may be for a word or even a whole // text line, and the output words will contain multiple blobs corresponding // to the space-delimited input string. // With find_segmentation false, no classifier is needed, but the chopper // can still be used to correctly segment touching characters with the help // of the input boxes. // In the returned PAGE_RES, the WERD_RES are setup as they would be returned // from normal classification, ie. with a word, chopped_word, rebuild_word, // seam_array, denorm, box_word, and best_state, but NO best_choice or // raw_choice, as they would require a UNICHARSET, which we aim to avoid. // Instead, the correct_text member of WERD_RES is set, and this may be later // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords // is not required before calling ApplyBoxTraining. PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname, bool find_segmentation, BLOCK_LIST *block_list) { GenericVector<TBOX> boxes; GenericVector<STRING> texts, full_texts; if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts, NULL)) { return NULL; // Can't do it. } int box_count = boxes.size(); int box_failures = 0; // Add an empty everything to the end. boxes.push_back(TBOX()); texts.push_back(STRING()); full_texts.push_back(STRING()); // In word mode, we use the boxes to make a word for each box, but // in blob mode we use the existing words and maximally chop them first. PAGE_RES* page_res = find_segmentation ? NULL : SetupApplyBoxes(boxes, block_list); clear_any_old_text(block_list); for (int i = 0; i < boxes.size() - 1; i++) { bool foundit = false; if (page_res != NULL) { if (i == 0) { foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1], full_texts[i].string()); } else { foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i], boxes[i + 1], full_texts[i].string()); } } else { foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1], texts[i].string()); } if (!foundit) { box_failures++; ReportFailedBox(i, boxes[i], texts[i].string(), "FAILURE! Couldn't find a matching blob"); } } if (page_res == NULL) { // In word/line mode, we now maximally chop all the words and resegment // them with the classifier. page_res = SetupApplyBoxes(boxes, block_list); ReSegmentByClassification(page_res); } if (applybox_debug > 0) { tprintf("APPLY_BOXES:\n"); tprintf(" Boxes read from boxfile: %6d\n", box_count); if (box_failures > 0) tprintf(" Boxes failed resegmentation: %6d\n", box_failures); } TidyUp(page_res); return page_res; }
void apply_boxes(BLOCK_LIST *block_list //real blocks ) { INT16 boxfile_lineno = 0; INT16 boxfile_charno = 0; BOX box; //boxfile box char ch[2]; //correct ch from boxfile ROW *row; ROW *prev_row = NULL; INT16 prev_box_right = MAX_INT16; INT16 block_id; INT16 row_id; INT16 box_count = 0; INT16 box_failures = 0; INT16 labels_ok; INT16 rows_ok; INT16 bad_blobs; INT16 tgt_char_counts[128]; //No. of box samples // INT16 labelled_char_counts[128]; //No. of unique labelled samples INT16 i; INT16 rebalance_count = 0; char min_char; INT16 min_samples; INT16 final_labelled_blob_count; for (i = 0; i < 128; i++) tgt_char_counts[i] = 0; FILE* box_file; STRING filename = imagefile; filename += ".box"; if (!(box_file = fopen (filename.string(), "r"))) { CANTOPENFILE.error ("read_next_box", EXIT, "Cant open box file %s %d", filename.string(), errno); } ch[1] = '\0'; clear_any_old_text(block_list); while (read_next_box (box_file, &box, &ch[0])) { box_count++; tgt_char_counts[ch[0]]++; row = find_row_of_box (block_list, box, block_id, row_id); if (box.left () < prev_box_right) { boxfile_lineno++; boxfile_charno = 1; } else boxfile_charno++; if (row == NULL) { box_failures++; report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! box overlaps no blobs or blobs in multiple rows"); } else { if ((box.left () >= prev_box_right) && (row != prev_row)) report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "WARNING! false row break"); box_failures += resegment_box (row, box, ch, block_id, row_id, boxfile_lineno, boxfile_charno); prev_row = row; } prev_box_right = box.right (); } tidy_up(block_list, labels_ok, rows_ok, bad_blobs, tgt_char_counts, rebalance_count, min_char, min_samples, final_labelled_blob_count); tprintf ("APPLY_BOXES:\n"); tprintf (" Boxes read from boxfile: %6d\n", box_count); tprintf (" Initially labelled blobs: %6d in %d rows\n", labels_ok, rows_ok); tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples); tprintf (" Total unlabelled words: %6d\n", bad_blobs); tprintf (" Final labelled words: %6d\n", final_labelled_blob_count); }