/*---------------------------------------------------------------------------*/ void LearnBlob (TBLOB * Blob, TEXTROW * Row, char BlobText[], int TextLength) /* ** Parameters: ** Blob blob whose micro-features are to be learned ** Row row of text that blob came from ** BlobText text that corresponds to blob ** TextLength number of characters in blob ** Globals: ** imagefile base filename of the page being learned ** FontName name of font currently being trained on ** Operation: ** Extract micro-features from the specified blob and append ** them to the appropriate file. ** Return: none ** Exceptions: none ** History: 7/28/89, DSJ, Created. */ #define MAXFILENAME 80 #define MAXCHARNAME 20 #define MAXFONTNAME 20 #define TRAIN_SUFFIX ".tr" { static FILE *FeatureFile = NULL; char Filename[MAXFILENAME]; char CharName[MAXCHARNAME]; CHAR_DESC CharDesc; LINE_STATS LineStats; EnterLearnMode; // throw out blobs which do not represent only one character if (TextLength != 1) return; GetLineStatsFromRow(Row, &LineStats); CharDesc = ExtractBlobFeatures (Blob, &LineStats); // if a feature file is not yet open, open it // the name of the file is the name of the image plus TRAIN_SUFFIX if (FeatureFile == NULL) { strcpy(Filename, imagefile); strcat(Filename, TRAIN_SUFFIX); FeatureFile = Efopen (Filename, "w"); cprintf ("TRAINING ... Font name = %s.\n", FontName); } // get the name of the character for this blob chartoname (CharName, BlobText[0], ""); // label the features with a class name and font name fprintf (FeatureFile, "\n%s %s ", FontName, CharName); // write micro-features to file and clean up WriteCharDescription(FeatureFile, CharDesc); FreeCharDescription(CharDesc); } // LearnBlob
/*--------------------------------------------------------------------------*/
/*
 **	Parameters:
 **		Directory	directory to write into, or NULL to use a bare
 **				"Microfeat" file name
 **		ClassList	list of MERGE_CLASS entries to write
 **	Operation:
 **		Opens <Directory>/Microfeat for writing and, for every entry
 **		of ClassList, writes its protos (WriteProtos) followed by its
 **		configs (WriteConfigs).  If the path does not fit into the
 **		MAXNAMESIZE-byte buffer an error is printed and nothing is
 **		written.
 **	Return: none
 */
void WriteMicrofeat(
    char  *Directory,
    LIST  ClassList)
{
  FILE    *File;
  char    Filename[MAXNAMESIZE];
  MERGE_CLASS MergeClass;
  size_t  PathLength = strlen ("Microfeat");

  if (Directory != NULL)
    PathLength += strlen (Directory) + 1;  // + 1 for the '/' separator

  // Filename is a fixed-size buffer; the NUL terminator needs a byte too.
  if (PathLength >= sizeof (Filename)) {
    fprintf (stderr, "\nError! Path of Microfeat file is too long\n");
    return;
  }

  strcpy (Filename, "");
  if (Directory != NULL)
  {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "Microfeat");
  File = Efopen (Filename, "w");
  printf ("\nWriting Merged %s ...", Filename);
  iterate(ClassList)
  {
    MergeClass = (MERGE_CLASS) first_node (ClassList);
    WriteProtos(File, MergeClass);
    WriteConfigs(File, MergeClass->Class);
  }
  fclose (File);
} // WriteMicrofeat
/*----------------------------------------------------------------------------*/ void WriteNormProtos ( char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer) /* ** Parameters: ** Directory directory to place sample files into ** Globals: ** MaxNumSamples max number of samples per class to write ** Operation: ** This routine writes the specified samples into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ { FILE *File; char Filename[MAXNAMESIZE]; LABELEDLIST LabeledProto; int N; strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "normproto"); printf ("\nWriting %s ...", Filename); File = Efopen (Filename, "w"); fprintf(File,"%0d\n",Clusterer->SampleSize); WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc); iterate(LabeledProtoList) { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); N = NumberOfProtos(LabeledProto->List, ShowSignificantProtos, ShowInsignificantProtos); if (N < 1) { printf ("\nError! Not enough protos for %s: %d protos" " (%d significant protos" ", %d insignificant protos)\n", LabeledProto->Label, N, NumberOfProtos(LabeledProto->List, 1, 0), NumberOfProtos(LabeledProto->List, 0, 1)); exit(1); } fprintf(File, "\n%s %d\n", LabeledProto->Label, N); WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, ShowSignificantProtos, ShowInsignificantProtos); } fclose (File); } // WriteNormProtos
// Writes stored training data to a .tr file based on the given filename. // Returns false on error. bool Classify::WriteTRFile(const STRING& filename) { STRING tr_filename = filename + ".tr"; FILE* fp = Efopen(tr_filename.string(), "wb"); size_t len = tr_file_data_.length(); bool result = fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len; fclose(fp); tr_file_data_.truncate_at(0); return result; }
/*---------------------------------------------------------------------------*/
// Builds "<Directory>/Merged/<Label><Suffix>" (no directory prefix when
// Directory is NULL) into Filename, which holds FilenameSize bytes.
// Returns false, after printing an error, when the path would not fit.
static bool BuildMergedFilename(char *Filename, size_t FilenameSize,
                                const char *Directory, const char *Label,
                                const char *Suffix) {
  size_t PathLength = strlen ("Merged/") + strlen (Label) + strlen (Suffix);
  if (Directory != NULL)
    PathLength += strlen (Directory) + 1;  // + 1 for the '/' separator

  // The NUL terminator needs a byte too, hence ">=".
  if (PathLength >= FilenameSize) {
    fprintf (stderr, "\nError! Path of merged file for %s%s is too long\n",
             Label, Suffix);
    return false;
  }

  strcpy (Filename, "");
  if (Directory != NULL) {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "Merged/");
  strcat (Filename, Label);
  strcat (Filename, Suffix);
  return true;
}

/*
 **	Parameters:
 **		Directory	directory holding the "Merged" sub-directory,
 **				or NULL for no directory prefix
 **		ClassList	list of MERGE_CLASS entries to write
 **	Operation:
 **		For every class in ClassList writes two files below
 **		<Directory>/Merged/: <Label>PROTO_SUFFIX via WriteOldProtoFile
 **		and <Label>CONFIG_SUFFIX via WriteOldConfigFile.  A file whose
 **		path does not fit into the MAXNAMESIZE-byte buffer is skipped
 **		after an error message.
 **	Return: none
 */
void WriteMergedTrainingSamples(
    char  *Directory,
    LIST  ClassList)
{
  FILE    *File;
  char    Filename[MAXNAMESIZE];
  MERGE_CLASS MergeClass;

  iterate (ClassList)
  {
    MergeClass = (MERGE_CLASS) first_node (ClassList);

    if (BuildMergedFilename (Filename, sizeof (Filename), Directory,
                             MergeClass->Label, PROTO_SUFFIX)) {
      printf ("\nWriting Merged %s ...", Filename);
      File = Efopen (Filename, "w");
      WriteOldProtoFile (File, MergeClass->Class);
      fclose (File);
    }

    if (BuildMergedFilename (Filename, sizeof (Filename), Directory,
                             MergeClass->Label, CONFIG_SUFFIX)) {
      printf ("\nWriting Merged %s ...", Filename);
      File = Efopen (Filename, "w");
      WriteOldConfigFile (File, MergeClass->Class);
      fclose (File);
    }
  }

} // WriteMergedTrainingSamples
/*----------------------------------------------------------------------------*/ void WriteNormProtos ( const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer) /* ** Parameters: ** Directory directory to place sample files into ** Operation: ** This routine writes the specified samples into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ { FILE *File; STRING Filename; LABELEDLIST LabeledProto; int N; Filename = ""; if (Directory != NULL && Directory[0] != '\0') { Filename += Directory; Filename += "/"; } Filename += "normproto"; printf ("\nWriting %s ...", Filename.string()); File = Efopen (Filename.string(), "wb"); fprintf(File,"%0d\n",Clusterer->SampleSize); WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc); iterate(LabeledProtoList) { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); N = NumberOfProtos(LabeledProto->List, true, false); if (N < 1) { printf ("\nError! Not enough protos for %s: %d protos" " (%d significant protos" ", %d insignificant protos)\n", LabeledProto->Label, N, NumberOfProtos(LabeledProto->List, 1, 0), NumberOfProtos(LabeledProto->List, 0, 1)); exit(1); } fprintf(File, "\n%s %d\n", LabeledProto->Label, N); WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false); } fclose (File); } // WriteNormProtos
/*---------------------------------------------------------------------------*/ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, TBLOB * Blob, const DENORM& denorm, const char* BlobText) { /* ** Parameters: ** Blob blob whose micro-features are to be learned ** Row row of text that blob came from ** BlobText text that corresponds to blob ** TextLength number of characters in blob ** Globals: ** imagefile base filename of the page being learned ** classify_font_name ** name of font currently being trained on ** Operation: ** Extract micro-features from the specified blob and append ** them to the appropriate file. ** Return: none ** Exceptions: none ** History: 7/28/89, DSJ, Created. */ #define TRAIN_SUFFIX ".tr" static FILE *FeatureFile = NULL; STRING Filename(filename); // If no fontname was set, try to extract it from the filename STRING CurrFontName = classify_font_name; if (CurrFontName == kUnknownFontName) { // filename is expected to be of the form [lang].[fontname].exp[num] // The [lang], [fontname] and [num] fields should not have '.' characters. const char *basename = strrchr(filename.string(), '/'); const char *firstdot = strchr(basename ? basename : filename.string(), '.'); const char *lastdot = strrchr(filename.string(), '.'); if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { ++firstdot; CurrFontName = firstdot; CurrFontName[lastdot - firstdot] = '\0'; } } // if a feature file is not yet open, open it // the name of the file is the name of the image plus TRAIN_SUFFIX if (FeatureFile == NULL) { Filename += TRAIN_SUFFIX; FeatureFile = Efopen(Filename.string(), "w"); cprintf("TRAINING ... Font name = %s\n", CurrFontName.string()); } LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText, CurrFontName.string()); } // LearnBlob
/*----------------------------------------------------------------------------*/ void WriteClusteredTrainingSamples ( char *Directory, LIST ProtoList, CLUSTERER *Clusterer, LABELEDLIST CharSample) /* ** Parameters: ** Directory directory to place sample files into ** Operation: ** This routine writes the specified samples into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ { FILE *File; char Filename[MAXNAMESIZE]; strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, CTFontName); strcat (Filename, "/"); strcat (Filename, CharSample->Label); strcat (Filename, "."); strcat (Filename, PROGRAM_FEATURE_TYPE); strcat (Filename, ".p"); printf ("\nWriting %s ...", Filename); File = Efopen (Filename, "w"); WriteProtoList(File, Clusterer->SampleSize, Clusterer->ParamDesc, ProtoList, ShowSignificantProtos, ShowInsignificantProtos); fclose (File); } /* WriteClusteredTrainingSamples */
/*---------------------------------------------------------------------------*/
/*
 **	Parameters: none
 **	Globals:
 **		demodir		prefix prepended to NormProtoFile
 **		NormProtoFile	name of file containing normalization protos
 **		NormProtos	global data structure to hold protos
 **	Operation: This routine reads in a set of character normalization
 **		protos from demodir + NormProtoFile and places them into
 **		NormProtos.  If the combined name does not fit into the local
 **		1024-byte buffer an error is printed and NormProtos is left
 **		untouched.
 **	Return: none
 **	Exceptions: none
 **	History: Wed Dec 19 16:24:25 1990, DSJ, Created.
 */
void GetNormProtos() {
  FILE *File;
  char name[1024];

  // name is a fixed-size buffer; the NUL terminator needs a byte too.
  if (strlen (demodir) + strlen (NormProtoFile) >= sizeof (name)) {
    fprintf (stderr, "GetNormProtos: norm proto file name is too long\n");
    return;
  }

  strcpy(name, demodir);
  strcat(name, NormProtoFile);
  File = Efopen (name, "r");
  NormProtos = ReadNormProtos (File);
  fclose(File);

}                                /* GetNormProtos */
/** * This routine writes the specified samples into files which * are organized according to the font name and character name * of the samples. * @param Directory directory to place sample files into * @param LabeledProtoList List of labeled protos * @param feature_desc Description of the features * @return none * @note Exceptions: none * @note History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc) { FILE *File; STRING Filename; LABELEDLIST LabeledProto; int N; Filename = ""; if (Directory != NULL && Directory[0] != '\0') { Filename += Directory; Filename += "/"; } Filename += "normproto"; printf ("\nWriting %s ...", Filename.string()); File = Efopen (Filename.string(), "wb"); fprintf(File, "%0d\n", feature_desc->NumParams); WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc); iterate(LabeledProtoList) { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); N = NumberOfProtos(LabeledProto->List, true, false); if (N < 1) { printf ("\nError! Not enough protos for %s: %d protos" " (%d significant protos" ", %d insignificant protos)\n", LabeledProto->Label, N, NumberOfProtos(LabeledProto->List, 1, 0), NumberOfProtos(LabeledProto->List, 0, 1)); exit(1); } fprintf(File, "\n%s %d\n", LabeledProto->Label, N); WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false); } fclose (File); } // WriteNormProtos
/*---------------------------------------------------------------------------*/ void ReadNewCutoffs(const char *Filename, CLASS_TO_INDEX ClassMapper, CLASS_CUTOFF_ARRAY Cutoffs) { /* ** Parameters: ** Filename name of file containing cutoff definitions ** ClassMapper array which maps class id's to class indexes ** Cutoffs array to put cutoffs into ** Globals: none ** Operation: Open Filename, read in all of the class-id/cutoff pairs ** and insert them into the Cutoffs array. Cutoffs are ** inserted in the array so that the array is indexed by ** class index rather than class id. Unused entries in the ** array are set to an arbitrarily high cutoff value. ** Return: none ** Exceptions: none ** History: Wed Feb 20 09:38:26 1991, DSJ, Created. */ FILE *CutoffFile; char Class[UNICHAR_LEN + 1]; CLASS_ID ClassId; int Cutoff; int i; CutoffFile = Efopen (Filename, "r"); for (i = 0; i < MAX_NUM_CLASSES; i++) Cutoffs[i] = MAX_CUTOFF; while (fscanf (CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", Class, &Cutoff) == 2) { ClassId = unicharset.unichar_to_id(Class); Cutoffs[ClassMapper[ClassId]] = Cutoff; } fclose(CutoffFile); } /* ReadNewCutoffs */
/*--------------------------------------------------------------------------*/
// Writes one "<unichar> <length>" line per class of Templates to filename,
// where <length> is the largest entry of the class's ConfigLengths (0 when
// the class has no configs). The unichar " " is written as "NULL"; any other
// class without configs triggers an error message but is still written.
void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
  FILE* table = Efopen(filename, "wb");

  /* then write out each class */
  for (int class_id = 0; class_id < Templates->NumClasses; ++class_id) {
    INT_CLASS int_class = ClassForClassId (Templates, class_id);
    const char *label = unicharset_training.id_to_unichar(class_id);

    if (strcmp(label, " ") == 0) {
      label = "NULL";
    } else if (int_class->NumConfigs == 0) {
      cprintf("Error: no configs for class %s in mftraining\n", label);
    }

    // Todo: Test with min instead of max
    int longest = 0;
    for (int config = 0; config < int_class->NumConfigs; ++config) {
      const int length = int_class->ConfigLengths[config];
      if (length > longest)
        longest = length;
    }
    fprintf(table, "%s %d\n", label, longest);
  }
  fclose(table);
} // WritePFFMTable
// Creates a MasterTraininer and loads the training data into it: // Initializes feature_defs and IntegerFX. // Loads the shape_table if shape_table != NULL. // Loads initial unicharset from -U command-line option. // If FLAGS_input_trainer is set, loads the majority of data from there, else: // Loads font info from -F option. // Loads xheights from -X option. // Loads samples from .tr files in remaining command-line args. // Deletes outliers and computes canonical samples. // If FLAGS_output_trainer is set, saves the trainer for future use. // Computes canonical and cloud features. // If shape_table is not NULL, but failed to load, make a fake flat one, // as shape clustering was not run. MasterTrainer* LoadTrainingData(int argc, const char* const * argv, bool replication, ShapeTable** shape_table, STRING* file_prefix) { InitFeatureDefs(&feature_defs); InitIntegerFX(); *file_prefix = ""; if (!FLAGS_D.empty()) { *file_prefix += FLAGS_D.c_str(); *file_prefix += "/"; } // If we are shape clustering (NULL shape_table) or we successfully load // a shape_table written by a previous shape clustering, then // shape_analysis will be true, meaning that the MasterTrainer will replace // some members of the unicharset with their fragments. bool shape_analysis = false; if (shape_table != NULL) { *shape_table = LoadShapeTable(*file_prefix); if (*shape_table != NULL) shape_analysis = true; } else { shape_analysis = true; } MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC, shape_analysis, replication, FLAGS_debug_level); if (FLAGS_input_trainer.empty()) { trainer->LoadUnicharset(FLAGS_U.c_str()); // Get basic font information from font_properties. 
if (!FLAGS_F.empty()) { if (!trainer->LoadFontInfo(FLAGS_F.c_str())) { delete trainer; return NULL; } } if (!FLAGS_X.empty()) { if (!trainer->LoadXHeights(FLAGS_X.c_str())) { delete trainer; return NULL; } } IntFeatureSpace fs; fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets); trainer->SetFeatureSpace(fs); const char* page_name; // Load training data from .tr files on the command line. while ((page_name = GetNextFilename(argc, argv)) != NULL) { tprintf("Reading %s ...\n", page_name); FILE* fp = Efopen(page_name, "rb"); trainer->ReadTrainingSamples(fp, feature_defs, false); fclose(fp); // If there is a file with [lang].[fontname].exp[num].fontinfo present, // read font spacing information in to fontinfo_table. int pagename_len = strlen(page_name); char *fontinfo_file_name = new char[pagename_len + 7]; strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr" strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo" trainer->AddSpacingInfo(fontinfo_file_name); delete[] fontinfo_file_name; // Load the images into memory if required by the classifier. if (FLAGS_load_images) { STRING image_name = page_name; // Chop off the tr and replace with tif. Extension must be tif! image_name.truncate_at(image_name.length() - 2); image_name += "tif"; trainer->LoadPageImages(image_name.string()); } } trainer->PostLoadCleanup(); // Write the master trainer if required. 
if (!FLAGS_output_trainer.empty()) { FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb"); if (fp == NULL) { tprintf("Can't create saved trainer data!\n"); } else { trainer->Serialize(fp); fclose(fp); } } } else { bool success = false; tprintf("Loading master trainer from file:%s\n", FLAGS_input_trainer.c_str()); FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb"); if (fp == NULL) { tprintf("Can't read file %s to initialize master trainer\n", FLAGS_input_trainer.c_str()); } else { success = trainer->DeSerialize(false, fp); fclose(fp); } if (!success) { tprintf("Deserialize of master trainer failed!\n"); delete trainer; return NULL; } } trainer->PreTrainingSetup(); if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) { fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str()); delete trainer; return NULL; } if (shape_table != NULL) { // If we previously failed to load a shapetable, then shape clustering // wasn't run so make a flat one now. if (*shape_table == NULL) { *shape_table = new ShapeTable; trainer->SetupFlatShapeTable(*shape_table); tprintf("Flat shape table summary: %s\n", (*shape_table)->SummaryStr().string()); } (*shape_table)->set_unicharset(trainer->unicharset()); } return trainer; }
/** * This program reads in a text file consisting of feature * samples from a training page in the following format: * @verbatim FontName CharName NumberOfFeatureTypes(N) FeatureTypeName1 NumberOfFeatures(M) Feature1 ... FeatureM FeatureTypeName2 NumberOfFeatures(M) Feature1 ... FeatureM ... FeatureTypeNameN NumberOfFeatures(M) Feature1 ... FeatureM FontName CharName ... @endverbatim * It then appends these samples into a separate file for each * character. The name of the file is * * DirectoryName/FontName/CharName.FeatureTypeName * * The DirectoryName can be specified via a command * line argument. If not specified, it defaults to the * current directory. The format of the resulting files is: * @verbatim NumberOfFeatures(M) Feature1 ... FeatureM NumberOfFeatures(M) ... @endverbatim * The output files each have a header which describes the * type of feature which the file contains. This header is * in the format required by the clusterer. A command line * argument can also be used to specify that only the first * N samples of each class should be used. * @param argc number of command line arguments * @param argv array of command line arguments * @return none * @note Globals: none * @note Exceptions: none * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created. */ int main(int argc, char *argv[]) { // Set the global Config parameters before parsing the command line. 
Config = CNConfig; const char *PageName; FILE *TrainingPage; LIST CharList = NIL_LIST; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL_LIST; LIST NormProtoList = NIL_LIST; LIST pCharList; LABELEDLIST CharSample; FEATURE_DEFS_STRUCT FeatureDefs; InitFeatureDefs(&FeatureDefs); ParseArguments(&argc, &argv); int num_fonts = 0; while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf("Reading %s ...\n", PageName); TrainingPage = Efopen(PageName, "rb"); ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, NULL, TrainingPage, &CharList); fclose(TrainingPage); ++num_fonts; } printf("Clustering ...\n"); // To allow an individual font to form a separate cluster, // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; // The norm protos will count the source protos, so we keep them here in // freeable_protos, so they can be freed later. GenericVector<LIST> freeable_protos; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); if (Clusterer == NULL) { // To avoid a SIGSEGV fprintf(stderr, "Error: NULL clusterer!\n"); return 1; } float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: // Config.MagicSamples = CharSample->SampleCount * 10; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) { break; } else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." 
" Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); freeable_protos.push_back(ProtoList); FreeClusterer(Clusterer); } FreeTrainingSamples(CharList); int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]); FreeNormProtoList(NormProtoList); for (int i = 0; i < freeable_protos.size(); ++i) { FreeProtoList(&freeable_protos[i]); } printf ("\n"); return 0; } // main
/*---------------------------------------------------------------------------*/ int main ( int argc, char **argv) /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** It then appends these samples into a separate file for each ** character. The name of the file is ** ** DirectoryName/FontName/CharName.FeatureTypeName ** ** The DirectoryName can be specified via a command ** line argument. If not specified, it defaults to the ** current directory. The format of the resulting files is: ** ** NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** NumberOfFeatures(M) ** ... ** ** The output files each have a header which describes the ** type of feature which the file contains. This header is ** in the format required by the clusterer. A command line ** argument can also be used to specify that only the first ** N samples of each class should be used. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. 
*/ { char *PageName; FILE *TrainingPage; LIST CharList = NIL; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LIST NormProtoList = NIL; LIST pCharList; LABELEDLIST CharSample; ParseArguments (argc, argv); while ((PageName = GetNextFilename()) != NULL) { printf ("Reading %s ...\n", PageName); TrainingPage = Efopen (PageName, "r"); ReadTrainingSamples (TrainingPage, &CharList); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); } printf("Clustering ...\n"); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); //printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample); float SavedMinSamples = Config.MinSamples; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) break; else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." " Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); } FreeTrainingSamples (CharList); WriteNormProtos (Directory, NormProtoList, Clusterer); FreeClusterer(Clusterer); FreeProtoList(&ProtoList); FreeNormProtoList(NormProtoList); printf ("\n"); return 0; } // main
/*---------------------------------------------------------------------------*/ void LearnBlob (const STRING& filename, TBLOB * Blob, TEXTROW * Row, char BlobText[]) /* ** Parameters: ** Blob blob whose micro-features are to be learned ** Row row of text that blob came from ** BlobText text that corresponds to blob ** TextLength number of characters in blob ** Globals: ** imagefile base filename of the page being learned ** classify_font_name ** name of font currently being trained on ** Operation: ** Extract micro-features from the specified blob and append ** them to the appropriate file. ** Return: none ** Exceptions: none ** History: 7/28/89, DSJ, Created. */ #define TRAIN_SUFFIX ".tr" { static FILE *FeatureFile = NULL; STRING Filename(filename); CHAR_DESC CharDesc; LINE_STATS LineStats; EnterLearnMode; GetLineStatsFromRow(Row, &LineStats); CharDesc = ExtractBlobFeatures (Blob, &LineStats); if (CharDesc == NULL) { cprintf("LearnBLob: CharDesc was NULL. Aborting.\n"); return; } // If no fontname was set, try to extract it from the filename char CurrFontName[32] = ""; strncpy(CurrFontName, static_cast<STRING>(classify_font_name).string(), 32); /* if (!strcmp(CurrFontName, "UnknownFont")) { // filename is expected to be of the form [lang].[fontname].exp[num] // The [lang], [fontname] and [num] fields should not have '.' characters. const char *basename = strrchr(filename.string(), '/'); const char *firstdot = strchr(basename, '.'); const char *lastdot = strrchr(filename.string(), '.'); if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { strncpy(CurrFontName, firstdot + 1, lastdot - firstdot - 1); } } //*/ // if a feature file is not yet open, open it // the name of the file is the name of the image plus TRAIN_SUFFIX if (FeatureFile == NULL) { Filename += TRAIN_SUFFIX; FeatureFile = Efopen (Filename.string(), "w"); cprintf ("TRAINING ... 
Font name = %s\n", CurrFontName); } // label the features with a class name and font name fprintf (FeatureFile, "\n%s %s ", CurrFontName, BlobText); // write micro-features to file and clean up WriteCharDescription(FeatureFile, CharDesc); FreeCharDescription(CharDesc); } // LearnBlob
/*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ char *PageName; FILE *TrainingPage; FILE *OutFile; LIST CharList; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LABELEDLIST CharSample; PROTOTYPE *Prototype; LIST ClassList = NIL; int Cid, Pid; PROTO Proto; PROTO_STRUCT DummyProto; BIT_VECTOR Config2; MERGE_CLASS MergeClass; INT_TEMPLATES IntTemplates; LIST pCharList, pProtoList; char Filename[MAXNAMESIZE]; tesseract::Classify classify; ParseArguments (argc, argv); if (InputUnicharsetFile == NULL) { InputUnicharsetFile = kInputUnicharsetFile; } if (OutputUnicharsetFile == NULL) { OutputUnicharsetFile = kOutputUnicharsetFile; } if (!unicharset_training.load_from_file(InputUnicharsetFile)) { fprintf(stderr, "Failed to load unicharset from file %s\n" "Building unicharset for mftraining from scratch...\n", InputUnicharsetFile); unicharset_training.clear(); // Space character needed to represent NIL classification. 
unicharset_training.unichar_insert(" "); } if (InputFontInfoFile != NULL) { FILE* f = fopen(InputFontInfoFile, "r"); if (f == NULL) { fprintf(stderr, "Failed to load font_properties\n"); } else { int italic, bold, fixed, serif, fraktur; while (!feof(f)) { FontInfo fontinfo; fontinfo.name = new char[1024]; fontinfo.properties = 0; if (fscanf(f, "%1024s %i %i %i %i %i\n", fontinfo.name, &italic, &bold, &fixed, &serif, &fraktur) != 6) continue; fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) + (serif << 3) + (fraktur << 4); if (!classify.get_fontinfo_table().contains(fontinfo)) { classify.get_fontinfo_table().push_back(fontinfo); } else { fprintf(stderr, "Font %s already defined\n", fontinfo.name); return 1; } } fclose(f); } } while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf ("Reading %s ...\n", PageName); char *short_name = strrchr(PageName, '/'); if (short_name == NULL) short_name = PageName; else ++short_name; // filename is expected to be of the form [lang].[fontname].exp[num].tr // If it is, then set short_name to be the [fontname]. Otherwise it is just // the file basename with the .tr extension removed. char *font_dot = strchr(short_name, '.'); char *exp_dot = (font_dot != NULL) ? 
strstr(font_dot, ".exp") : NULL; if (font_dot != NULL && exp_dot != NULL && font_dot != exp_dot) { short_name = new_dup(font_dot + 1); short_name[exp_dot - font_dot - 1] = '\0'; } else { short_name = new_dup(short_name); int len = strlen(short_name); if (!strcmp(short_name + len - 3, ".tr")) short_name[len - 3] = '\0'; } int fontinfo_id; FontInfo fontinfo; fontinfo.name = short_name; fontinfo.properties = 0; // Not used to lookup in the table if (!classify.get_fontinfo_table().contains(fontinfo)) { fontinfo_id = classify.get_fontinfo_table().push_back(fontinfo); printf("%s has no defined properties.\n", short_name); } else { fontinfo_id = classify.get_fontinfo_table().get_id(fontinfo); // Update the properties field fontinfo = classify.get_fontinfo_table().get(fontinfo_id); delete[] short_name; } TrainingPage = Efopen (PageName, "r"); CharList = ReadTrainingSamples (TrainingPage); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); // printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE); Config.MagicSamples = CharSample->SampleCount; ProtoList = ClusterSamples(Clusterer, &Config); CleanUpUnusedData(ProtoList); //Merge MergeInsignificantProtos(ProtoList, CharSample->Label, Clusterer, &Config); if (strcmp(test_ch, CharSample->Label) == 0) DisplayProtoList(test_ch, ProtoList); ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos, ShowInsignificantProtos, Clusterer->SampleSize); FreeClusterer(Clusterer); MergeClass = FindClass (ClassList, CharSample->Label); if (MergeClass == NULL) { MergeClass = NewLabeledClass (CharSample->Label); ClassList = push (ClassList, MergeClass); } Cid = AddConfigToClass(MergeClass->Class); MergeClass->Class->font_set.push_back(fontinfo_id); pProtoList = ProtoList; iterate (pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); // 
see if proto can be approximated by existing proto Pid = FindClosestExistingProto(MergeClass->Class, MergeClass->NumMerged, Prototype); if (Pid == NO_PROTO) { Pid = AddProtoToClass (MergeClass->Class); Proto = ProtoIn (MergeClass->Class, Pid); MakeNewFromOld (Proto, Prototype); MergeClass->NumMerged[Pid] = 1; } else { MakeNewFromOld (&DummyProto, Prototype); ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto, (FLOAT32) MergeClass->NumMerged[Pid], 1.0, ProtoIn (MergeClass->Class, Pid)); MergeClass->NumMerged[Pid] ++; } Config2 = MergeClass->Class->Configurations[Cid]; AddProtoToConfig (Pid, Config2); } FreeProtoList (&ProtoList); } FreeTrainingSamples (CharList); } //WriteMergedTrainingSamples(Directory,ClassList); WriteMicrofeat(Directory, ClassList); SetUpForFloat2Int(ClassList); IntTemplates = classify.CreateIntTemplates(TrainingData, unicharset_training); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "inttemp"); #ifdef __UNIX__ OutFile = Efopen (Filename, "w"); #else OutFile = Efopen (Filename, "wb"); #endif classify.WriteIntTemplates(OutFile, IntTemplates, unicharset_training); fclose (OutFile); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "pffmtable"); // Now create pffmtable. WritePFFMTable(IntTemplates, Filename); // Write updated unicharset to a file. if (!unicharset_training.save_to_file(OutputUnicharsetFile)) { fprintf(stderr, "Failed to save unicharset to file %s\n", OutputUnicharsetFile); exit(1); } printf ("Done!\n"); /**/ FreeLabeledClassList (ClassList); return 0; } /* main */
/*---------------------------------------------------------------------------*/ int main ( int argc, char **argv) /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** It then appends these samples into a separate file for each ** character. The name of the file is ** ** DirectoryName/FontName/CharName.FeatureTypeName ** ** The DirectoryName can be specified via a command ** line argument. If not specified, it defaults to the ** current directory. The format of the resulting files is: ** ** NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** NumberOfFeatures(M) ** ... ** ** The output files each have a header which describes the ** type of feature which the file contains. This header is ** in the format required by the clusterer. A command line ** argument can also be used to specify that only the first ** N samples of each class should be used. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. 
*/ { char *PageName; FILE *TrainingPage; LIST CharList = NIL_LIST; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL_LIST; LIST NormProtoList = NIL_LIST; LIST pCharList; LABELEDLIST CharSample; FEATURE_DEFS_STRUCT FeatureDefs; InitFeatureDefs(&FeatureDefs); ParseArguments(argc, argv); int num_fonts = 0; while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf("Reading %s ...\n", PageName); TrainingPage = Efopen(PageName, "r"); ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, 1.0f / 64.0f, 0.0f, NULL, TrainingPage, &CharList); fclose(TrainingPage); ++num_fonts; } printf("Clustering ...\n"); // To allow an individual font to form a separate cluster, // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: // Config.MagicSamples = CharSample->SampleCount * 10; Config.MagicSamples = CharSample->SampleCount; while (Config.MinSamples > 0.001) { ProtoList = ClusterSamples(Clusterer, &Config); if (NumberOfProtos(ProtoList, 1, 0) > 0) { break; } else { Config.MinSamples *= 0.95; printf("0 significant protos for %s." " Retrying clustering with MinSamples = %f%%\n", CharSample->Label, Config.MinSamples); } } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); } FreeTrainingSamples(CharList); if (Clusterer == NULL) // To avoid a SIGSEGV return 1; WriteNormProtos (Directory, NormProtoList, Clusterer); FreeClusterer(Clusterer); FreeProtoList(&ProtoList); FreeNormProtoList(NormProtoList); printf ("\n"); return 0; } // main
/*---------------------------------------------------------------------------*/ void WriteTrainingSamples ( char *Directory, LIST CharList, const char* program_feature_type) /* ** Parameters: ** Directory directory to place sample files into ** FontList list of fonts used in the training samples ** Operation: ** This routine writes the specified samples into files which ** are organized according to the font name and character name ** of the samples. ** Return: none ** Exceptions: none ** History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ { LABELEDLIST CharSample; FEATURE_SET FeatureSet; LIST FeatureList; FILE *File; char Filename[MAXNAMESIZE]; int NumSamples; iterate (CharList) // iterate thru all of the fonts { CharSample = (LABELEDLIST) first_node (CharList); // construct the full pathname for the current samples file strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, CTFontName); strcat (Filename, "/"); strcat (Filename, CharSample->Label); strcat (Filename, "."); strcat (Filename, program_feature_type); printf ("\nWriting %s ...", Filename); /* if file does not exist, create a new one with an appropriate header; otherwise append samples to the existing file */ File = fopen (Filename, "r"); if (File == NULL) { File = Efopen (Filename, "w"); WriteOldParamDesc( File, FeatureDefs.FeatureDesc[ShortNameToFeatureType( program_feature_type)]); } else { fclose (File); File = Efopen (Filename, "a"); } // append samples onto the file FeatureList = CharSample->List; NumSamples = 0; iterate (FeatureList) { FeatureSet = (FEATURE_SET) first_node (FeatureList); WriteFeatureSet (File, FeatureSet); NumSamples++; } fclose (File); } } /* WriteTrainingSamples */