/** * This program reads in a text file consisting of feature * samples from a training page in the following format: * @verbatim FontName UTF8-char-str xmin ymin xmax ymax page-number NumberOfFeatureTypes(N) FeatureTypeName1 NumberOfFeatures(M) Feature1 ... FeatureM FeatureTypeName2 NumberOfFeatures(M) Feature1 ... FeatureM ... FeatureTypeNameN NumberOfFeatures(M) Feature1 ... FeatureM FontName CharName ... @endverbatim * The result of this program is a binary inttemp file used by * the OCR engine. * @param argc number of command line arguments * @param argv array of command line arguments * @return none * @note Exceptions: none * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created. * @note History: Mon May 18 1998, Christy Russson, Revistion started. */ int main (int argc, char **argv) { ParseArguments(&argc, &argv); ShapeTable* shape_table = NULL; STRING file_prefix; // Load the training data. MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, false, &shape_table, &file_prefix); if (trainer == NULL) return 1; // Failed. // Setup an index mapping from the shapes in the shape table to the classes // that will be trained. In keeping with the original design, each shape // with the same list of unichars becomes a different class and the configs // represent the different combinations of fonts. IndexMapBiDi config_map; SetupConfigMap(shape_table, &config_map); WriteShapeTable(file_prefix, *shape_table); // If the shape_table is flat, then either we didn't run shape clustering, or // it did nothing, so we just output the trainer's unicharset. // Otherwise shape_set will hold a fake unicharset with an entry for each // shape in the shape table, and we will output that instead. UNICHARSET shape_set; const UNICHARSET* unicharset = &trainer->unicharset(); // If we ran shapeclustering (and it worked) then at least one shape will // have multiple unichars, so we have to build a fake unicharset. if (shape_table->AnyMultipleUnichars()) { unicharset = &shape_set; // Now build a fake unicharset for the compact shape space to keep the // output modules happy that we are doing things correctly. int num_shapes = config_map.CompactSize(); for (int s = 0; s < num_shapes; ++s) { char shape_label[kMaxShapeLabelLength + 1]; snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s); shape_set.unichar_insert(shape_label); } } // Now train each config separately. int num_configs = shape_table->NumShapes(); LIST mf_classes = NIL_LIST; for (int s = 0; s < num_configs; ++s) { int unichar_id, font_id; if (unicharset == &shape_set) { // Using fake unichar_ids from the config_map/shape_set. unichar_id = config_map.SparseToCompact(s); } else { // Get the real unichar_id from the shape table/unicharset. shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); } const char* class_label = unicharset->id_to_unichar(unichar_id); mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer); } STRING inttemp_file = file_prefix; inttemp_file += "inttemp"; STRING pffmtable_file = file_prefix; pffmtable_file += "pffmtable"; CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes); // Now write the inttemp and pffmtable. trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); for (int c = 0; c < unicharset->size(); ++c) { FreeClassFields(&float_classes[c]); } delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; delete shape_table; printf("Done!\n"); if (!FLAGS_test_ch.empty()) { // If we are displaying debug window(s), wait for the user to look at them. printf("Hit return to exit...\n"); while (getchar() != '\n'); } return 0; } /* main */
/*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName UTF8-char-str xmin ymin xmax ymax page-number ** NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ ParseArguments(&argc, &argv); ShapeTable* shape_table = NULL; STRING file_prefix; // Load the training data. MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, false, &shape_table, &file_prefix); if (trainer == NULL) return 1; // Failed. // Setup an index mapping from the shapes in the shape table to the classes // that will be trained. In keeping with the original design, each shape // with the same list of unichars becomes a different class and the configs // represent the different combinations of fonts. IndexMapBiDi config_map; SetupConfigMap(shape_table, &config_map); WriteShapeTable(file_prefix, *shape_table); // If the shape_table is flat, then either we didn't run shape clustering, or // it did nothing, so we just output the trainer's unicharset. // Otherwise shape_set will hold a fake unicharset with an entry for each // shape in the shape table, and we will output that instead. UNICHARSET shape_set; const UNICHARSET* unicharset = &trainer->unicharset(); // If we ran shapeclustering (and it worked) then at least one shape will // have multiple unichars, so we have to build a fake unicharset. if (shape_table->AnyMultipleUnichars()) { unicharset = &shape_set; // Now build a fake unicharset for the compact shape space to keep the // output modules happy that we are doing things correctly. int num_shapes = config_map.CompactSize(); for (int s = 0; s < num_shapes; ++s) { char shape_label[kMaxShapeLabelLength + 1]; snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s); shape_set.unichar_insert(shape_label); } } // Now train each config separately. int num_configs = shape_table->NumShapes(); LIST mf_classes = NIL_LIST; for (int s = 0; s < num_configs; ++s) { int unichar_id, font_id; if (unicharset == &shape_set) { // Using fake unichar_ids from the config_map/shape_set. unichar_id = config_map.SparseToCompact(s); } else { // Get the real unichar_id from the shape table/unicharset. shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id); } const char* class_label = unicharset->id_to_unichar(unichar_id); mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer); } STRING inttemp_file = file_prefix; inttemp_file += "inttemp"; STRING pffmtable_file = file_prefix; pffmtable_file += "pffmtable"; CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes); // Now write the inttemp and pffmtable. trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; delete shape_table; printf("Done!\n"); if (!FLAGS_test_ch.empty()) { // If we are displaying debug window(s), wait for the user to look at them. printf("Hit return to exit...\n"); while (getchar() != '\n'); } return 0; } /* main */