/*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ char *PageName; FILE *TrainingPage; FILE *OutFile; LIST CharList; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LABELEDLIST CharSample; PROTOTYPE *Prototype; LIST ClassList = NIL; int Cid, Pid; PROTO Proto; PROTO_STRUCT DummyProto; BIT_VECTOR Config2; MERGE_CLASS MergeClass; INT_TEMPLATES IntTemplates; LIST pCharList, pProtoList; char Filename[MAXNAMESIZE]; tesseract::Classify classify; ParseArguments (argc, argv); if (InputUnicharsetFile == NULL) { InputUnicharsetFile = kInputUnicharsetFile; } if (OutputUnicharsetFile == NULL) { OutputUnicharsetFile = kOutputUnicharsetFile; } if (!unicharset_training.load_from_file(InputUnicharsetFile)) { fprintf(stderr, "Failed to load unicharset from file %s\n" "Building unicharset for mftraining from scratch...\n", InputUnicharsetFile); unicharset_training.clear(); // Space character needed to represent NIL classification. unicharset_training.unichar_insert(" "); } if (InputFontInfoFile != NULL) { FILE* f = fopen(InputFontInfoFile, "r"); if (f == NULL) { fprintf(stderr, "Failed to load font_properties\n"); } else { int italic, bold, fixed, serif, fraktur; while (!feof(f)) { FontInfo fontinfo; fontinfo.name = new char[1024]; fontinfo.properties = 0; if (fscanf(f, "%1024s %i %i %i %i %i\n", fontinfo.name, &italic, &bold, &fixed, &serif, &fraktur) != 6) continue; fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) + (serif << 3) + (fraktur << 4); if (!classify.get_fontinfo_table().contains(fontinfo)) { classify.get_fontinfo_table().push_back(fontinfo); } else { fprintf(stderr, "Font %s already defined\n", fontinfo.name); return 1; } } fclose(f); } } while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf ("Reading %s ...\n", PageName); char *short_name = strrchr(PageName, '/'); if (short_name == NULL) short_name = PageName; else ++short_name; // filename is expected to be of the form [lang].[fontname].exp[num].tr // If it is, then set short_name to be the [fontname]. Otherwise it is just // the file basename with the .tr extension removed. char *font_dot = strchr(short_name, '.'); char *exp_dot = (font_dot != NULL) ? strstr(font_dot, ".exp") : NULL; if (font_dot != NULL && exp_dot != NULL && font_dot != exp_dot) { short_name = new_dup(font_dot + 1); short_name[exp_dot - font_dot - 1] = '\0'; } else { short_name = new_dup(short_name); int len = strlen(short_name); if (!strcmp(short_name + len - 3, ".tr")) short_name[len - 3] = '\0'; } int fontinfo_id; FontInfo fontinfo; fontinfo.name = short_name; fontinfo.properties = 0; // Not used to lookup in the table if (!classify.get_fontinfo_table().contains(fontinfo)) { fontinfo_id = classify.get_fontinfo_table().push_back(fontinfo); printf("%s has no defined properties.\n", short_name); } else { fontinfo_id = classify.get_fontinfo_table().get_id(fontinfo); // Update the properties field fontinfo = classify.get_fontinfo_table().get(fontinfo_id); delete[] short_name; } TrainingPage = Efopen (PageName, "r"); CharList = ReadTrainingSamples (TrainingPage); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); // printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE); Config.MagicSamples = CharSample->SampleCount; ProtoList = ClusterSamples(Clusterer, &Config); CleanUpUnusedData(ProtoList); //Merge MergeInsignificantProtos(ProtoList, CharSample->Label, Clusterer, &Config); if (strcmp(test_ch, CharSample->Label) == 0) DisplayProtoList(test_ch, ProtoList); ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos, ShowInsignificantProtos, Clusterer->SampleSize); FreeClusterer(Clusterer); MergeClass = FindClass (ClassList, CharSample->Label); if (MergeClass == NULL) { MergeClass = NewLabeledClass (CharSample->Label); ClassList = push (ClassList, MergeClass); } Cid = AddConfigToClass(MergeClass->Class); MergeClass->Class->font_set.push_back(fontinfo_id); pProtoList = ProtoList; iterate (pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); // see if proto can be approximated by existing proto Pid = FindClosestExistingProto(MergeClass->Class, MergeClass->NumMerged, Prototype); if (Pid == NO_PROTO) { Pid = AddProtoToClass (MergeClass->Class); Proto = ProtoIn (MergeClass->Class, Pid); MakeNewFromOld (Proto, Prototype); MergeClass->NumMerged[Pid] = 1; } else { MakeNewFromOld (&DummyProto, Prototype); ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto, (FLOAT32) MergeClass->NumMerged[Pid], 1.0, ProtoIn (MergeClass->Class, Pid)); MergeClass->NumMerged[Pid] ++; } Config2 = MergeClass->Class->Configurations[Cid]; AddProtoToConfig (Pid, Config2); } FreeProtoList (&ProtoList); } FreeTrainingSamples (CharList); } //WriteMergedTrainingSamples(Directory,ClassList); WriteMicrofeat(Directory, ClassList); SetUpForFloat2Int(ClassList); IntTemplates = classify.CreateIntTemplates(TrainingData, unicharset_training); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "inttemp"); #ifdef __UNIX__ OutFile = Efopen (Filename, "w"); #else OutFile = Efopen (Filename, "wb"); #endif classify.WriteIntTemplates(OutFile, IntTemplates, unicharset_training); fclose (OutFile); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "pffmtable"); // Now create pffmtable. WritePFFMTable(IntTemplates, Filename); // Write updated unicharset to a file. if (!unicharset_training.save_to_file(OutputUnicharsetFile)) { fprintf(stderr, "Failed to save unicharset to file %s\n", OutputUnicharsetFile); exit(1); } printf ("Done!\n"); /**/ FreeLabeledClassList (ClassList); return 0; } /* main */
/** * This program reads in a text file consisting of feature * samples from a training page in the following format: * @verbatim FontName CharName NumberOfFeatureTypes(N) FeatureTypeName1 NumberOfFeatures(M) Feature1 ... FeatureM FeatureTypeName2 NumberOfFeatures(M) Feature1 ... FeatureM ... FeatureTypeNameN NumberOfFeatures(M) Feature1 ... FeatureM FontName CharName ... @endverbatim * It then appends these samples into a separate file for each * character. The name of the file is * * DirectoryName/FontName/CharName.FeatureTypeName * * The DirectoryName can be specified via a command * line argument. If not specified, it defaults to the * current directory. The format of the resulting files is: * @verbatim NumberOfFeatures(M) Feature1 ... FeatureM NumberOfFeatures(M) ... @endverbatim * The output files each have a header which describes the * type of feature which the file contains. This header is * in the format required by the clusterer. A command line * argument can also be used to specify that only the first * N samples of each class should be used. * @param argc number of command line arguments * @param argv array of command line arguments * @return none * @note Globals: none * @note Exceptions: none * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created. */ int main(int argc, char *argv[]) { // Set the global Config parameters before parsing the command line. Config = CNConfig; const char *PageName; FILE *TrainingPage; LIST CharList = NIL_LIST; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL_LIST; LIST NormProtoList = NIL_LIST; LIST pCharList; LABELEDLIST CharSample; FEATURE_DEFS_STRUCT FeatureDefs; InitFeatureDefs(&FeatureDefs); ParseArguments(&argc, &argv); int num_fonts = 0; while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf("Reading %s ...\n", PageName); TrainingPage = Efopen(PageName, "rb"); ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, NULL, TrainingPage, &CharList); fclose(TrainingPage); ++num_fonts; } printf("Clustering ...\n"); // To allow an individual font to form a separate cluster, // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; // The norm protos will count the source protos, so we keep them here in // freeable_protos, so they can be freed later. GenericVector<LIST> freeable_protos; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); if (Clusterer == NULL) { // To avoid a SIGSEGV fprintf(stderr, "Error: NULL clusterer!\n"); return 1; } float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: // Config.MagicSamples = CharSample->SampleCount * 10; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) { break; } else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." " Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); freeable_protos.push_back(ProtoList); FreeClusterer(Clusterer); } FreeTrainingSamples(CharList); int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]); FreeNormProtoList(NormProtoList); for (int i = 0; i < freeable_protos.size(); ++i) { FreeProtoList(&freeable_protos[i]); } printf ("\n"); return 0; } // main
/*---------------------------------------------------------------------------*/ int main ( int argc, char **argv) /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** It then appends these samples into a separate file for each ** character. The name of the file is ** ** DirectoryName/FontName/CharName.FeatureTypeName ** ** The DirectoryName can be specified via a command ** line argument. If not specified, it defaults to the ** current directory. The format of the resulting files is: ** ** NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** NumberOfFeatures(M) ** ... ** ** The output files each have a header which describes the ** type of feature which the file contains. This header is ** in the format required by the clusterer. A command line ** argument can also be used to specify that only the first ** N samples of each class should be used. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. */ { char *PageName; FILE *TrainingPage; LIST CharList = NIL_LIST; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL_LIST; LIST NormProtoList = NIL_LIST; LIST pCharList; LABELEDLIST CharSample; FEATURE_DEFS_STRUCT FeatureDefs; InitFeatureDefs(&FeatureDefs); ParseArguments(argc, argv); int num_fonts = 0; while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf("Reading %s ...\n", PageName); TrainingPage = Efopen(PageName, "r"); ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, 1.0f / 64.0f, 0.0f, NULL, TrainingPage, &CharList); fclose(TrainingPage); ++num_fonts; } printf("Clustering ...\n"); // To allow an individual font to form a separate cluster, // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: // Config.MagicSamples = CharSample->SampleCount * 10; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) { break; } else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." " Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); } FreeTrainingSamples(CharList); if (Clusterer == NULL) // To avoid a SIGSEGV return 1; WriteNormProtos (Directory, NormProtoList, Clusterer); FreeClusterer(Clusterer); FreeProtoList(&ProtoList); FreeNormProtoList(NormProtoList); printf ("\n"); return 0; } // main
/*---------------------------------------------------------------------------*/ int main ( int argc, char **argv) /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** It then appends these samples into a separate file for each ** character. The name of the file is ** ** DirectoryName/FontName/CharName.FeatureTypeName ** ** The DirectoryName can be specified via a command ** line argument. If not specified, it defaults to the ** current directory. The format of the resulting files is: ** ** NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** NumberOfFeatures(M) ** ... ** ** The output files each have a header which describes the ** type of feature which the file contains. This header is ** in the format required by the clusterer. A command line ** argument can also be used to specify that only the first ** N samples of each class should be used. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. */ { char *PageName; FILE *TrainingPage; LIST CharList = NIL; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LIST NormProtoList = NIL; LIST pCharList; LABELEDLIST CharSample; ParseArguments (argc, argv); while ((PageName = GetNextFilename()) != NULL) { printf ("Reading %s ...\n", PageName); TrainingPage = Efopen (PageName, "r"); ReadTrainingSamples (TrainingPage, &CharList); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); } printf("Clustering ...\n"); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); //printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample); float SavedMinSamples = Config.MinSamples; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) break; else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." " Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); } FreeTrainingSamples (CharList); WriteNormProtos (Directory, NormProtoList, Clusterer); FreeClusterer(Clusterer); FreeProtoList(&ProtoList); FreeNormProtoList(NormProtoList); printf ("\n"); return 0; } // main