// Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the // script_dir directory, then the tops and bottoms are expanded using the // script unicharset. // If non-empty, xheight data for the fonts are written to the xheights_file. void SetPropertiesForInputFile(const std::string& script_dir, const std::string& input_unicharset_file, const std::string& output_unicharset_file, const std::string& output_xheights_file) { UNICHARSET unicharset; // Load the input unicharset unicharset.load_from_file(input_unicharset_file.c_str()); tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), input_unicharset_file.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); SetupBasicProperties(true, false, &unicharset); tprintf("Setting script properties\n"); SetScriptProperties(script_dir, &unicharset); if (!output_xheights_file.empty()) { std::string xheights_str = GetXheightString(script_dir, unicharset); File::WriteStringToFileOrDie(xheights_str, output_xheights_file); } // Write the output unicharset tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); unicharset.save_to_file(output_unicharset_file.c_str()); }
// Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the // script_dir directory, then the tops and bottoms are expanded using the // script unicharset. // If non-empty, xheight data for the fonts are written to the xheights_file. void SetPropertiesForInputFile(const string& script_dir, const string& input_unicharset_file, const string& output_unicharset_file, const string& output_xheights_file) { UNICHARSET unicharset; // Load the input unicharset unicharset.load_from_file(input_unicharset_file.c_str()); tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), input_unicharset_file.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); SetupBasicProperties(true, false, &unicharset); string xheights_str; for (int s = 0; s < unicharset.get_script_table_size(); ++s) { // Load the unicharset for the script if available. string filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".unicharset"; UNICHARSET script_set; if (script_set.load_from_file(filename.c_str())) { unicharset.SetPropertiesFromOther(script_set); } // Load the xheights for the script if available. filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".xheights"; string script_heights; if (File::ReadFileToString(filename, &script_heights)) xheights_str += script_heights; } if (!output_xheights_file.empty()) File::WriteStringToFileOrDie(xheights_str, output_xheights_file); for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) { if (unicharset.PropertiesIncomplete(c)) { tprintf("Warning: properties incomplete for index %d = %s\n", c, unicharset.id_to_unichar(c)); } } // Write the output unicharset tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); unicharset.save_to_file(output_unicharset_file.c_str()); }
int main(int argc, char** argv) { // Sets properties on the input unicharset file, and writes: // rootdir/lang/lang.charset_size=ddd.txt // rootdir/lang/lang.traineddata // rootdir/lang/lang.unicharset // If the 3 word lists are provided, the dawgs are also added // to the traineddata file. // The output unicharset and charset_size files are just for // human readability. tesseract::CheckSharedLibraryVersion(); tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); GenericVector<STRING> words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words); tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs); tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers); // Load the input unicharset UNICHARSET unicharset; if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) { tprintf("Failed to load unicharset from %s\n", FLAGS_input_unicharset.c_str()); return 1; } tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), FLAGS_input_unicharset.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); tesseract::SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset); tprintf("Setting script properties\n"); tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset); // Combine everything into a traineddata file. return tesseract::CombineLangModel( unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr, /*writer*/ nullptr); }
int main(int argc, char *argv[]) { if (argc != 4) { tprintf("Print all the words in a given dawg.\n"); tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n", argv[0]); return 1; } const char *unicharset_file = argv[1]; const char *dawg_file = argv[2]; const char *wordlist_file = argv[3]; UNICHARSET unicharset; if (!unicharset.load_from_file(unicharset_file)) { tprintf("Error loading unicharset from %s.\n", unicharset_file); return 1; } tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file); if (dict == NULL) { tprintf("Error loading dictionary from %s.\n", dawg_file); return 1; } int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file); delete dict; return retval; }
int Main() { if (FLAGS_list_available_fonts) { const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts(); for (unsigned int i = 0; i < all_fonts.size(); ++i) { printf("%3u: %s\n", i, all_fonts[i].c_str()); ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()), "Font %s is unrecognized.\n", all_fonts[i].c_str()); } return EXIT_SUCCESS; } // Check validity of input flags. if (FLAGS_text.empty()) { tprintf("'--text' option is missing!\n"); exit(1); } if (FLAGS_outputbase.empty()) { tprintf("'--outputbase' option is missing!\n"); exit(1); } if (!FLAGS_unicharset_file.empty() && FLAGS_render_ngrams) { tprintf("Use '--unicharset_file' only if '--render_ngrams' is set.\n"); exit(1); } if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(FLAGS_font.c_str())) { std::string pango_name; if (!FontUtils::IsAvailableFont(FLAGS_font.c_str(), &pango_name)) { tprintf("Could not find font named %s.\n", FLAGS_font.c_str()); if (!pango_name.empty()) { tprintf("Pango suggested font %s.\n", pango_name.c_str()); } tprintf("Please correct --font arg.\n"); exit(1); } } if (FLAGS_render_ngrams) FLAGS_output_word_boxes = true; char font_desc_name[1024]; snprintf(font_desc_name, 1024, "%s %d", FLAGS_font.c_str(), static_cast<int>(FLAGS_ptsize)); StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize); render.set_add_ligatures(FLAGS_ligatures); render.set_leading(FLAGS_leading); render.set_resolution(FLAGS_resolution); render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize); render.set_h_margin(FLAGS_margin); render.set_v_margin(FLAGS_margin); render.set_output_word_boxes(FLAGS_output_word_boxes); render.set_box_padding(FLAGS_box_padding); render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words); render.set_underline_start_prob(FLAGS_underline_start_prob); render.set_underline_continuation_prob(FLAGS_underline_continuation_prob); // Set text rendering orientation and their forms. if (FLAGS_writing_mode == "horizontal") { // Render regular horizontal text (default). render.set_vertical_text(false); render.set_gravity_hint_strong(false); render.set_render_fullwidth_latin(false); } else if (FLAGS_writing_mode == "vertical") { // Render vertical text. Glyph orientation is selected by Pango. render.set_vertical_text(true); render.set_gravity_hint_strong(false); render.set_render_fullwidth_latin(false); } else if (FLAGS_writing_mode == "vertical-upright") { // Render vertical text. Glyph orientation is set to be upright. // Also Basic Latin characters are converted to their fullwidth forms // on rendering, since fullwidth Latin characters are well designed to fit // vertical text lines, while .box files store halfwidth Basic Latin // unichars. render.set_vertical_text(true); render.set_gravity_hint_strong(true); render.set_render_fullwidth_latin(true); } else { tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str()); exit(1); } std::string src_utf8; // This c_str is NOT redundant! if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) { tprintf("Failed to read file: %s\n", FLAGS_text.c_str()); exit(1); } // Remove the unicode mark if present. if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) { src_utf8.erase(0, 3); } tlog(1, "Render string of size %d\n", src_utf8.length()); if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) { // Try to preserve behavior of old text2image by expanding inter-word // spaces by a factor of 4. const std::string kSeparator = FLAGS_render_ngrams ? " " : " "; // Also restrict the number of characters per line to try and avoid // line-breaking in the middle of words like "-A", "R$" etc. which are // otherwise allowed by the standard unicode line-breaking rules. const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100; std::string rand_utf8; UNICHARSET unicharset; if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() && !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) { tprintf("Failed to load unicharset from file %s\n", FLAGS_unicharset_file.c_str()); exit(1); } // If we are rendering ngrams that will be OCRed later, shuffle them so that // tesseract does not have difficulties finding correct baseline, word // spaces, etc. const char *str8 = src_utf8.c_str(); int len = src_utf8.length(); int step; std::vector<std::pair<int, int> > offsets; int offset = SpanUTF8Whitespace(str8); while (offset < len) { step = SpanUTF8NotWhitespace(str8 + offset); offsets.push_back(std::make_pair(offset, step)); offset += step; offset += SpanUTF8Whitespace(str8 + offset); } if (FLAGS_render_ngrams) std::random_shuffle(offsets.begin(), offsets.end()); for (size_t i = 0, line = 1; i < offsets.size(); ++i) { const char *curr_pos = str8 + offsets[i].first; int ngram_len = offsets[i].second; // Skip words that contain characters not in found in unicharset. std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len); if (!FLAGS_unicharset_file.empty() && !unicharset.encodable_string(cleaned.c_str(), nullptr)) { continue; } rand_utf8.append(curr_pos, ngram_len); if (rand_utf8.length() > line * kCharsPerLine) { rand_utf8.append(" \n"); ++line; if (line & 0x1) rand_utf8.append(kSeparator); } else { rand_utf8.append(kSeparator); } } tlog(1, "Rendered ngram string of size %d\n", rand_utf8.length()); src_utf8.swap(rand_utf8); } if (FLAGS_only_extract_font_properties) { tprintf("Extracting font properties only\n"); ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str()); tprintf("Done!\n"); return 0; } int im = 0; std::vector<float> page_rotation; const char* to_render_utf8 = src_utf8.c_str(); tesseract::TRand randomizer; randomizer.set_seed(kRandomSeed); std::vector<std::string> font_names; // We use a two pass mechanism to rotate images in both direction. // The first pass(0) will rotate the images in random directions and // the second pass(1) will mirror those rotations. int num_pass = FLAGS_bidirectional_rotation ? 2 : 1; for (int pass = 0; pass < num_pass; ++pass) { int page_num = 0; std::string font_used; for (size_t offset = 0; offset < strlen(to_render_utf8) && (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages); ++im, ++page_num) { tlog(1, "Starting page %d\n", im); Pix* pix = nullptr; if (FLAGS_find_fonts) { offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset, strlen(to_render_utf8 + offset), &font_used, &pix); } else { offset += render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix); } if (pix != nullptr) { float rotation = 0; if (pass == 1) { // Pass 2, do mirror rotation. rotation = -1 * page_rotation[page_num]; } if (FLAGS_degrade_image) { pix = DegradeImage(pix, FLAGS_exposure, &randomizer, FLAGS_rotate_image ? &rotation : nullptr); } render.RotatePageBoxes(rotation); if (pass == 0) { // Pass 1, rotate randomly and store the rotation.. page_rotation.push_back(rotation); } Pix* gray_pix = pixConvertTo8(pix, false); pixDestroy(&pix); Pix* binary = pixThresholdToBinary(gray_pix, 128); pixDestroy(&gray_pix); char tiff_name[1024]; if (FLAGS_find_fonts) { if (FLAGS_render_per_font) { std::string fontname_for_file = tesseract::StringReplace( font_used, " ", "_"); snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(), fontname_for_file.c_str()); pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w"); tprintf("Rendered page %d to file %s\n", im, tiff_name); } else { font_names.push_back(font_used); } } else { snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str()); pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a"); tprintf("Rendered page %d to file %s\n", im, tiff_name); } // Make individual glyphs if (FLAGS_output_individual_glyph_images) { if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) { tprintf("ERROR: Individual glyphs not saved\n"); } } pixDestroy(&binary); } if (FLAGS_find_fonts && offset != 0) { // We just want a list of names, or some sample images so we don't need // to render more than the first page of the text. break; } } } if (!FLAGS_find_fonts) { std::string box_name = FLAGS_outputbase.c_str(); box_name += ".box"; render.WriteAllBoxes(box_name); } else if (!FLAGS_render_per_font && !font_names.empty()) { std::string filename = FLAGS_outputbase.c_str(); filename += ".fontlist.txt"; FILE* fp = fopen(filename.c_str(), "wb"); if (fp == nullptr) { tprintf("Failed to create output font list %s\n", filename.c_str()); } else { for (size_t i = 0; i < font_names.size(); ++i) { fprintf(fp, "%s\n", font_names[i].c_str()); } fclose(fp); } } return 0; }
// Apart from command-line flags, input is a collection of lstmf files, that // were previously created using tesseract with the lstm.train config file. // The program iterates over the inputs, feeding the data to the network, // until the error rate reaches a specified target or max_iterations is reached. int main(int argc, char **argv) { ParseArguments(&argc, &argv); // Purify the model name in case it is based on the network string. if (FLAGS_model_output.empty()) { tprintf("Must provide a --model_output!\n"); return 1; } STRING model_output = FLAGS_model_output.c_str(); for (int i = 0; i < model_output.length(); ++i) { if (model_output[i] == '[' || model_output[i] == ']') model_output[i] = '-'; if (model_output[i] == '(' || model_output[i] == ')') model_output[i] = '_'; } // Setup the trainer. STRING checkpoint_file = FLAGS_model_output.c_str(); checkpoint_file += "_checkpoint"; STRING checkpoint_bak = checkpoint_file + ".bak"; tesseract::LSTMTrainer trainer( NULL, NULL, NULL, NULL, FLAGS_model_output.c_str(), checkpoint_file.c_str(), FLAGS_debug_interval, static_cast<inT64>(FLAGS_max_image_MB) * 1048576); // Reading something from an existing model doesn't require many flags, // so do it now and exit. if (FLAGS_stop_training || FLAGS_debug_network) { if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) { tprintf("Failed to read continue from: %s\n", FLAGS_continue_from.c_str()); return 1; } if (FLAGS_debug_network) { trainer.DebugNetwork(); } else { if (FLAGS_train_mode & tesseract::TF_INT_MODE) trainer.ConvertToInt(); GenericVector<char> recognizer_data; trainer.SaveRecognitionDump(&recognizer_data); if (!tesseract::SaveDataToFile(recognizer_data, FLAGS_model_output.c_str())) { tprintf("Failed to write recognition model : %s\n", FLAGS_model_output.c_str()); } } return 0; } // Get the list of files to process. if (FLAGS_train_listfile.empty()) { tprintf("Must supply a list of training filenames! --train_listfile\n"); return 1; } GenericVector<STRING> filenames; if (!tesseract::LoadFileLinesToStrings(FLAGS_train_listfile.c_str(), &filenames)) { tprintf("Failed to load list of training filenames from %s\n", FLAGS_train_listfile.c_str()); return 1; } UNICHARSET unicharset; // Checkpoints always take priority if they are available. if (trainer.TryLoadingCheckpoint(checkpoint_file.string()) || trainer.TryLoadingCheckpoint(checkpoint_bak.string())) { tprintf("Successfully restored trainer from %s\n", checkpoint_file.string()); } else { if (!FLAGS_continue_from.empty()) { // Load a past model file to improve upon. if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) { tprintf("Failed to continue from: %s\n", FLAGS_continue_from.c_str()); return 1; } tprintf("Continuing from %s\n", FLAGS_continue_from.c_str()); trainer.InitIterations(); } if (FLAGS_continue_from.empty() || FLAGS_append_index >= 0) { // We need a unicharset to start from scratch or append. string unicharset_str; // Character coding to be used by the classifier. if (!unicharset.load_from_file(FLAGS_U.c_str())) { tprintf("Error: must provide a -U unicharset!\n"); return 1; } tesseract::SetupBasicProperties(true, &unicharset); if (FLAGS_append_index >= 0) { tprintf("Appending a new network to an old one!!"); if (FLAGS_continue_from.empty()) { tprintf("Must set --continue_from for appending!\n"); return 1; } } // We are initializing from scratch. trainer.InitCharSet(unicharset, FLAGS_script_dir.c_str(), FLAGS_train_mode); if (!trainer.InitNetwork(FLAGS_net_spec.c_str(), FLAGS_append_index, FLAGS_net_mode, FLAGS_weight_range, FLAGS_learning_rate, FLAGS_momentum)) { tprintf("Failed to create network from spec: %s\n", FLAGS_net_spec.c_str()); return 1; } trainer.set_perfect_delay(FLAGS_perfect_sample_delay); } } if (!trainer.LoadAllTrainingData(filenames)) { tprintf("Load of images failed!!\n"); return 1; } bool best_dumped = true; char* best_model_dump = NULL; size_t best_model_size = 0; STRING best_model_name; tesseract::LSTMTester tester(static_cast<inT64>(FLAGS_max_image_MB) * 1048576); tesseract::TestCallback tester_callback = nullptr; if (!FLAGS_eval_listfile.empty()) { if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) { tprintf("Failed to load eval data from: %s\n", FLAGS_eval_listfile.c_str()); return 1; } tester_callback = NewPermanentTessCallback(&tester, &tesseract::LSTMTester::RunEvalAsync); } do { // Train a few. int iteration = trainer.training_iteration(); for (int target_iteration = iteration + kNumPagesPerBatch; iteration < target_iteration; iteration = trainer.training_iteration()) { trainer.TrainOnLine(&trainer, false); } STRING log_str; trainer.MaintainCheckpoints(tester_callback, &log_str); tprintf("%s\n", log_str.string()); } while (trainer.best_error_rate() > FLAGS_target_error_rate && (trainer.training_iteration() < FLAGS_max_iterations || FLAGS_max_iterations == 0)); delete tester_callback; tprintf("Finished! Error rate = %g\n", trainer.best_error_rate()); return 0; } /* main */