/*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName UTF8-char-str xmin ymin xmax ymax page-number ** NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ ParseArguments(&argc, &argv); ShapeTable* shape_table = NULL; STRING file_prefix; // Load the training data. MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, false, &shape_table, &file_prefix); if (trainer == NULL) return 1; // Failed. // Setup an index mapping from the shapes in the shape table to the classes // that will be trained. In keeping with the original design, each shape // with the same list of unichars becomes a different class and the configs // represent the different combinations of fonts. IndexMapBiDi config_map; SetupConfigMap(shape_table, &config_map); WriteShapeTable(file_prefix, *shape_table); // If the shape_table is flat, then either we didn't run shape clustering, or // it did nothing, so we just output the trainer's unicharset. // Otherwise shape_set will hold a fake unicharset with an entry for each // shape in the shape table, and we will output that instead. 
UNICHARSET shape_set; const UNICHARSET* unicharset = &trainer->unicharset(); // If we ran shapeclustering (and it worked) then at least one shape will // have multiple unichars, so we have to build a fake unicharset. if (shape_table->AnyMultipleUnichars()) { unicharset = &shape_set; // Now build a fake unicharset for the compact shape space to keep the // output modules happy that we are doing things correctly. int num_shapes = config_map.CompactSize(); for (int s = 0; s < num_shapes; ++s) { char shape_label[kMaxShapeLabelLength + 1]; snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s); shape_set.unichar_insert(shape_label); } } // Now train each config separately. int num_configs = shape_table->NumShapes(); LIST mf_classes = NIL_LIST; for (int s = 0; s < num_configs; ++s) { int unichar_id, font_id; if (unicharset == &shape_set) { // Using fake unichar_ids from the config_map/shape_set. unichar_id = config_map.SparseToCompact(s); } else { // Get the real unichar_id from the shape table/unicharset. shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); } const char* class_label = unicharset->id_to_unichar(unichar_id); mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer); } STRING inttemp_file = file_prefix; inttemp_file += "inttemp"; STRING pffmtable_file = file_prefix; pffmtable_file += "pffmtable"; CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes); // Now write the inttemp and pffmtable. trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; delete shape_table; printf("Done!\n"); if (!FLAGS_test_ch.empty()) { // If we are displaying debug window(s), wait for the user to look at them. printf("Hit return to exit...\n"); while (getchar() != '\n'); } return 0; } /* main */
/** * This program reads in a text file consisting of feature * samples from a training page in the following format: * @verbatim FontName UTF8-char-str xmin ymin xmax ymax page-number NumberOfFeatureTypes(N) FeatureTypeName1 NumberOfFeatures(M) Feature1 ... FeatureM FeatureTypeName2 NumberOfFeatures(M) Feature1 ... FeatureM ... FeatureTypeNameN NumberOfFeatures(M) Feature1 ... FeatureM FontName CharName ... @endverbatim * The result of this program is a binary inttemp file used by * the OCR engine. * @param argc number of command line arguments * @param argv array of command line arguments * @return none * @note Exceptions: none * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created. * @note History: Mon May 18 1998, Christy Russson, Revistion started. */ int main (int argc, char **argv) { ParseArguments(&argc, &argv); ShapeTable* shape_table = NULL; STRING file_prefix; // Load the training data. MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, false, &shape_table, &file_prefix); if (trainer == NULL) return 1; // Failed. // Setup an index mapping from the shapes in the shape table to the classes // that will be trained. In keeping with the original design, each shape // with the same list of unichars becomes a different class and the configs // represent the different combinations of fonts. IndexMapBiDi config_map; SetupConfigMap(shape_table, &config_map); WriteShapeTable(file_prefix, *shape_table); // If the shape_table is flat, then either we didn't run shape clustering, or // it did nothing, so we just output the trainer's unicharset. // Otherwise shape_set will hold a fake unicharset with an entry for each // shape in the shape table, and we will output that instead. UNICHARSET shape_set; const UNICHARSET* unicharset = &trainer->unicharset(); // If we ran shapeclustering (and it worked) then at least one shape will // have multiple unichars, so we have to build a fake unicharset. 
if (shape_table->AnyMultipleUnichars()) { unicharset = &shape_set; // Now build a fake unicharset for the compact shape space to keep the // output modules happy that we are doing things correctly. int num_shapes = config_map.CompactSize(); for (int s = 0; s < num_shapes; ++s) { char shape_label[kMaxShapeLabelLength + 1]; snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s); shape_set.unichar_insert(shape_label); } } // Now train each config separately. int num_configs = shape_table->NumShapes(); LIST mf_classes = NIL_LIST; for (int s = 0; s < num_configs; ++s) { int unichar_id, font_id; if (unicharset == &shape_set) { // Using fake unichar_ids from the config_map/shape_set. unichar_id = config_map.SparseToCompact(s); } else { // Get the real unichar_id from the shape table/unicharset. shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); } const char* class_label = unicharset->id_to_unichar(unichar_id); mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer); } STRING inttemp_file = file_prefix; inttemp_file += "inttemp"; STRING pffmtable_file = file_prefix; pffmtable_file += "pffmtable"; CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes); // Now write the inttemp and pffmtable. trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); for (int c = 0; c < unicharset->size(); ++c) { FreeClassFields(&float_classes[c]); } delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; delete shape_table; printf("Done!\n"); if (!FLAGS_test_ch.empty()) { // If we are displaying debug window(s), wait for the user to look at them. printf("Hit return to exit...\n"); while (getchar() != '\n'); } return 0; } /* main */
/**
 * Creates a MasterTrainer and loads the training data into it:
 * Initializes feature_defs and IntegerFX.
 * Loads the shape_table if shape_table != nullptr.
 * Loads initial unicharset from -U command-line option.
 * If FLAGS_T is set, loads the majority of data from there, else:
 *  - Loads font info from -F option.
 *  - Loads xheights from -X option.
 *  - Loads samples from .tr files in remaining command-line args.
 *  - Deletes outliers and computes canonical samples.
 *  - If FLAGS_output_trainer is set, saves the trainer for future use.
 * Computes canonical and cloud features.
 * If shape_table is not nullptr, but failed to load, make a fake flat one,
 * as shape clustering was not run.
 *
 * @param argc         number of command line arguments
 * @param argv         array of command line arguments; the remaining
 *                     entries are consumed as .tr file names
 * @param replication  passed through to the MasterTrainer constructor
 * @param shape_table  if not nullptr, receives the loaded (or flat) shape
 *                     table; nullptr means shape clustering is being run
 * @param file_prefix  receives FLAGS_D followed by "/", or "" if FLAGS_D
 *                     is empty
 * @return the new trainer, or nullptr on failure (font info, xheights,
 *         deserialization or saving the unicharset failed). On failure the
 *         trainer is deleted here, but *shape_table may already have been
 *         set and is left for the caller to release.
 */
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != nullptr) shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  if (FLAGS_T.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
      tprintf("Reading %s ...\n", page_name);
      trainer->ReadTrainingSamples(page_name, feature_defs, false);

      // Length of the name without its trailing "tr". Clamped at zero so a
      // name shorter than two characters cannot yield a negative length
      // (which previously became a huge size_t count in strncpy).
      const int pagename_len = static_cast<int>(strlen(page_name));
      const int stem_len = pagename_len >= 2 ? pagename_len - 2 : 0;

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      STRING fontinfo_file_name = page_name;
      fontinfo_file_name.truncate_at(stem_len);  // remove "tr"
      fontinfo_file_name += "fontinfo";          // +"fontinfo"
      trainer->AddSpacingInfo(fontinfo_file_name.string());

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(stem_len);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == nullptr) {
        // Best effort: failing to save the trainer does not abort loading.
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_T.c_str());
    FILE* fp = fopen(FLAGS_T.c_str(), "rb");
    if (fp == nullptr) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_T.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return nullptr;
    }
    trainer->SetFeatureSpace(fs);
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return nullptr;
  }
  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}