int main(int argc, char** argv) { int option; const char* output_directory = "."; STRING unicharset_file_name; // Special characters are now included by default. UNICHARSET unicharset; setlocale(LC_ALL, ""); // Print usage if (argc <= 1) { printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); exit(1); } // Parse arguments while ((option = tessopt(argc, argv, "D" )) != EOF) { switch (option) { case 'D': output_directory = tessoptarg; ++tessoptind; break; } } // Save file name unicharset_file_name = output_directory; unicharset_file_name += "/"; unicharset_file_name += kUnicharsetFileName; // Load box files for (; tessoptind < argc; ++tessoptind) { printf("Extracting unicharset from %s\n", argv[tessoptind]); FILE* box_file = fopen(argv[tessoptind], "rb"); if (box_file == NULL) { printf("Cannot open box file %s\n", argv[tessoptind]); return -1; } TBOX box; STRING unichar_string; int line_number = 0; while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { unicharset.unichar_insert(unichar_string.string()); set_properties(&unicharset, unichar_string.string()); } } // Write unicharset file if (unicharset.save_to_file(unicharset_file_name.string())) { printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); } else { printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); return -1; } return 0; }
// This function takes tif/box pair of files and runs recognition on the image, // while making sure that the word bounds that tesseract identified roughly // match to those specified by the input box file. For each word (ngram in a // single bounding box from the input box file) it outputs the ocred result, // the correct label, rating and certainty. void Tesseract::recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) { STRING box_fname = fname; const char *lastdot = strrchr(box_fname.string(), '.'); if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0'; box_fname += ".box"; // ReadNextBox() will close box_file FILE *box_file = open_file(box_fname.string(), "r"); PAGE_RES_IT page_res_it; page_res_it.page_res = page_res; page_res_it.restart_page(); STRING label; // Process all the words on this page. TBOX tbox; // tesseract-identified box TBOX bbox; // box from the box file bool keep_going; int line_number = 0; int examined_words = 0; do { keep_going = read_t(&page_res_it, &tbox); keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); // Align bottom left points of the TBOXes. while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { if (bbox.bottom() < tbox.bottom()) { page_res_it.forward(); keep_going = read_t(&page_res_it, &tbox); } else { keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } } while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { if (bbox.left() > tbox.left()) { page_res_it.forward(); keep_going = read_t(&page_res_it, &tbox); } else { keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox); } } // OCR the word if top right points of the TBOXes are similar. if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { ambigs_classify_and_output(label.string(), &page_res_it, output_file); examined_words++; } page_res_it.forward(); } while (keep_going); // Set up scripts on all of the words that did not get sent to // ambigs_classify_and_output. They all should have, but if all the // werd_res's don't get uch_sets, tesseract will crash when you try // to iterate over them. :-( int total_words = 0; for (page_res_it.restart_page(); page_res_it.block() != NULL; page_res_it.forward()) { if (page_res_it.word()) { if (page_res_it.word()->uch_set == NULL) page_res_it.word()->SetupFake(unicharset); total_words++; } } if (examined_words < 0.85 * total_words) { tprintf("TODO(antonova): clean up recog_training_segmented; " " It examined only a small fraction of the ambigs image.\n"); } tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words); }
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes. // Box files are used ONLY DURING TRAINING, but by both processes of // creating tr files with tesseract, and unicharset_extractor. // ReadNextBox factors out the code to interpret a line of a box // file so that applybox and unicharset_extractor interpret the same way. // This function returns the next valid box file utf8 string and coords // and returns true, or false on eof (and closes the file). // It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks // for valid utf-8 and allows space or tab between fields. // utf8_str is set with the unichar string, and bounding box with the box. // If there are page numbers in the file, it reads them all. bool ReadNextBox(int *line_number, FILE* box_file, STRING* utf8_str, TBOX* bounding_box) { return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box); }