/**********************************************************************
 * PrintProtos
 *
 * Dump every prototype stored in Class.  For each proto id the output
 * is "Proto <id>:" and a tab, the proto as rendered by PrintProto, a
 * tab, the proto as rendered by PrintProtoLine, and finally new_line().
 *
 * Class  class whose prototypes are printed
 **********************************************************************/
void PrintProtos(CLASS_TYPE Class) {
  inT16 ProtoId = 0;

  while (ProtoId < Class->NumProtos) {
    cprintf ("Proto %d:\t", ProtoId);
    PrintProto (ProtoIn (Class, ProtoId));
    cprintf ("\t");
    PrintProtoLine (ProtoIn (Class, ProtoId));
    new_line();
    ProtoId++;
  }
}
/** * This routine searches thru all of the prototypes in * Class and returns the id of the proto which would provide * the best approximation of Prototype. If no close * approximation can be found, NO_PROTO is returned. * * @param Class class to search for matching old proto in * @param NumMerged # of protos merged into each proto of Class * @param Prototype new proto to find match for * * Globals: none * * @return Id of closest proto in Class or NO_PROTO. * @note Exceptions: none * @note History: Sat Nov 24 11:42:58 1990, DSJ, Created. */ int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype) { PROTO_STRUCT NewProto; PROTO_STRUCT MergedProto; int Pid; PROTO Proto; int BestProto; FLOAT32 BestMatch; FLOAT32 Match, OldMatch, NewMatch; MakeNewFromOld (&NewProto, Prototype); BestProto = NO_PROTO; BestMatch = WORST_MATCH_ALLOWED; for (Pid = 0; Pid < Class->NumProtos; Pid++) { Proto = ProtoIn(Class, Pid); ComputeMergedProto(Proto, &NewProto, (FLOAT32) NumMerged[Pid], 1.0, &MergedProto); OldMatch = CompareProtos(Proto, &MergedProto); NewMatch = CompareProtos(&NewProto, &MergedProto); Match = MIN(OldMatch, NewMatch); if (Match > BestMatch) { BestProto = Pid; BestMatch = Match; } } return BestProto; } /* FindClosestExistingProto */
/**********************************************************************
 * ClassProtoLength
 *
 * Sum the Length field of every proto in Class, visiting the protos in
 * increasing id order, and return the total.
 **********************************************************************/
FLOAT32 ClassProtoLength(CLASS_TYPE Class) {
  FLOAT32 Sum = 0;
  inT16 ProtoId = 0;

  while (ProtoId < Class->NumProtos) {
    Sum += (ProtoIn (Class, ProtoId))->Length;
    ++ProtoId;
  }

  return Sum;
}
/**********************************************************************
 * ClassConfigLength
 *
 * Return the summed Length of the protos of Class that belong to the
 * given configuration, i.e. those proto ids whose bit is set in Config.
 * Protos whose bit is clear do not contribute.
 **********************************************************************/
FLOAT32 ClassConfigLength(CLASS_TYPE Class, BIT_VECTOR Config) {
  FLOAT32 Sum = 0;
  inT16 ProtoId;

  for (ProtoId = 0; ProtoId < Class->NumProtos; ProtoId++) {
    if (!test_bit (Config, ProtoId))
      continue;                  /* proto is not part of this config */
    Sum += (ProtoIn (Class, ProtoId))->Length;
  }

  return Sum;
}
/*---------------------------------------------------------------------------*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE *Prototype) {
/*
**  Parameters:
**    Class       class to search for matching old proto in
**    NumMerged[] # of protos merged into each proto of Class
**    Prototype   new proto to find match for
**  Globals: none
**  Operation: Converts Prototype with MakeNewFromOld, then trial-merges
**    it with every proto of Class (existing proto weighted by its
**    NumMerged entry, new proto weighted by 1.0).  A candidate's score
**    is the smaller of its two CompareProtos results against the merged
**    proto; the highest score strictly above WORST_MATCH_ALLOWED wins.
**  Return: Id of closest proto in Class, or NO_PROTO if no candidate
**    scores above WORST_MATCH_ALLOWED.
**  Exceptions: none
**  History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
  PROTO_STRUCT Incoming;
  PROTO_STRUCT Blend;
  PROTO Existing;
  FLOAT32 FitToExisting, FitToIncoming, Fit;
  FLOAT32 WinningFit;
  int Winner;
  int Id;

  MakeNewFromOld (&Incoming, Prototype);

  Winner = NO_PROTO;
  WinningFit = WORST_MATCH_ALLOWED;

  Id = 0;
  while (Id < Class->NumProtos) {
    Existing = ProtoIn(Class, Id);
    ComputeMergedProto(Existing, &Incoming,
                       (FLOAT32) NumMerged[Id], 1.0, &Blend);

    FitToExisting = CompareProtos(Existing, &Blend);
    FitToIncoming = CompareProtos(&Incoming, &Blend);
    Fit = MIN(FitToExisting, FitToIncoming);

    if (Fit > WinningFit) {
      Winner = Id;
      WinningFit = Fit;
    }
    Id++;
  }

  return Winner;
}                                /* FindClosestExistingProto */
/********************************************************************** * ReadProtos * * Read in all the prototype information from a file. Read the number * of lines requested. **********************************************************************/ void ReadProtos(register FILE *File, CLASS_TYPE Class) { register inT16 Pid; register PROTO Proto; int NumProtos; fscanf (File, "%d\n", &NumProtos); Class->NumProtos = NumProtos; Class->MaxNumProtos = NumProtos; Class->Prototypes = (PROTO) Emalloc (sizeof (PROTO_STRUCT) * NumProtos); for (Pid = 0; Pid < NumProtos; Pid++) { Proto = ProtoIn (Class, Pid); fscanf (File, "%f %f %f %f %f %f %f\n", &Proto->X, &Proto->Y, &Proto->Length, &Proto->Angle, &Proto->A, &Proto->B, &Proto->C); } }
/*---------------------------------------------------------------------------*/ void WriteProtos( FILE* File, MERGE_CLASS MergeClass) { float Values[3]; int i; PROTO Proto; fprintf(File, "%s\n", MergeClass->Label); fprintf(File, "%d\n", MergeClass->Class->NumProtos); for(i=0; i < MergeClass->Class->NumProtos; i++) { Proto = ProtoIn(MergeClass->Class,i); fprintf(File, "\t%8.4f %8.4f %8.4f %8.4f ", Proto->X, Proto->Y, Proto->Length, Proto->Angle); Values[0] = Proto->X; Values[1] = Proto->Y; Values[2] = Proto->Angle; Normalize(Values); fprintf(File, "%8.4f %8.4f %8.4f\n", Values[0], Values[1], Values[2]); } } // WriteProtos
/********************************************************************** * WriteOldProtoFile * * Write the protos in the given class to the specified file in the * old proto format. **********************************************************************/ void WriteOldProtoFile(FILE *File, CLASS_TYPE Class) { int Pid; PROTO Proto; /* print old header */ fprintf (File, "6\n"); fprintf (File, "linear essential -0.500000 0.500000\n"); fprintf (File, "linear essential -0.250000 0.750000\n"); fprintf (File, "linear essential 0.000000 1.000000\n"); fprintf (File, "circular essential 0.000000 1.000000\n"); fprintf (File, "linear non-essential -0.500000 0.500000\n"); fprintf (File, "linear non-essential -0.500000 0.500000\n"); for (Pid = 0; Pid < Class->NumProtos; Pid++) { Proto = ProtoIn (Class, Pid); fprintf (File, "significant elliptical 1\n"); fprintf (File, " %9.6f %9.6f %9.6f %9.6f %9.6f %9.6f\n", Proto->X, Proto->Y, Proto->Length, Proto->Angle, 0.0, 0.0); fprintf (File, " %9.6f %9.6f %9.6f %9.6f %9.6f %9.6f\n", 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001); } }
/*---------------------------------------------------------------------------*/ int main (int argc, char **argv) { /* ** Parameters: ** argc number of command line arguments ** argv array of command line arguments ** Globals: none ** Operation: ** This program reads in a text file consisting of feature ** samples from a training page in the following format: ** ** FontName CharName NumberOfFeatureTypes(N) ** FeatureTypeName1 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FeatureTypeName2 NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** ... ** FeatureTypeNameN NumberOfFeatures(M) ** Feature1 ** ... ** FeatureM ** FontName CharName ... ** ** The result of this program is a binary inttemp file used by ** the OCR engine. ** Return: none ** Exceptions: none ** History: Fri Aug 18 08:56:17 1989, DSJ, Created. ** Mon May 18 1998, Christy Russson, Revistion started. */ char *PageName; FILE *TrainingPage; FILE *OutFile; LIST CharList; CLUSTERER *Clusterer = NULL; LIST ProtoList = NIL; LABELEDLIST CharSample; PROTOTYPE *Prototype; LIST ClassList = NIL; int Cid, Pid; PROTO Proto; PROTO_STRUCT DummyProto; BIT_VECTOR Config2; MERGE_CLASS MergeClass; INT_TEMPLATES IntTemplates; LIST pCharList, pProtoList; char Filename[MAXNAMESIZE]; tesseract::Classify classify; ParseArguments (argc, argv); if (InputUnicharsetFile == NULL) { InputUnicharsetFile = kInputUnicharsetFile; } if (OutputUnicharsetFile == NULL) { OutputUnicharsetFile = kOutputUnicharsetFile; } if (!unicharset_training.load_from_file(InputUnicharsetFile)) { fprintf(stderr, "Failed to load unicharset from file %s\n" "Building unicharset for mftraining from scratch...\n", InputUnicharsetFile); unicharset_training.clear(); // Space character needed to represent NIL classification. 
unicharset_training.unichar_insert(" "); } if (InputFontInfoFile != NULL) { FILE* f = fopen(InputFontInfoFile, "r"); if (f == NULL) { fprintf(stderr, "Failed to load font_properties\n"); } else { int italic, bold, fixed, serif, fraktur; while (!feof(f)) { FontInfo fontinfo; fontinfo.name = new char[1024]; fontinfo.properties = 0; if (fscanf(f, "%1024s %i %i %i %i %i\n", fontinfo.name, &italic, &bold, &fixed, &serif, &fraktur) != 6) continue; fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) + (serif << 3) + (fraktur << 4); if (!classify.get_fontinfo_table().contains(fontinfo)) { classify.get_fontinfo_table().push_back(fontinfo); } else { fprintf(stderr, "Font %s already defined\n", fontinfo.name); return 1; } } fclose(f); } } while ((PageName = GetNextFilename(argc, argv)) != NULL) { printf ("Reading %s ...\n", PageName); char *short_name = strrchr(PageName, '/'); if (short_name == NULL) short_name = PageName; else ++short_name; // filename is expected to be of the form [lang].[fontname].exp[num].tr // If it is, then set short_name to be the [fontname]. Otherwise it is just // the file basename with the .tr extension removed. char *font_dot = strchr(short_name, '.'); char *exp_dot = (font_dot != NULL) ? 
strstr(font_dot, ".exp") : NULL; if (font_dot != NULL && exp_dot != NULL && font_dot != exp_dot) { short_name = new_dup(font_dot + 1); short_name[exp_dot - font_dot - 1] = '\0'; } else { short_name = new_dup(short_name); int len = strlen(short_name); if (!strcmp(short_name + len - 3, ".tr")) short_name[len - 3] = '\0'; } int fontinfo_id; FontInfo fontinfo; fontinfo.name = short_name; fontinfo.properties = 0; // Not used to lookup in the table if (!classify.get_fontinfo_table().contains(fontinfo)) { fontinfo_id = classify.get_fontinfo_table().push_back(fontinfo); printf("%s has no defined properties.\n", short_name); } else { fontinfo_id = classify.get_fontinfo_table().get_id(fontinfo); // Update the properties field fontinfo = classify.get_fontinfo_table().get(fontinfo_id); delete[] short_name; } TrainingPage = Efopen (PageName, "r"); CharList = ReadTrainingSamples (TrainingPage); fclose (TrainingPage); //WriteTrainingSamples (Directory, CharList); pCharList = CharList; iterate(pCharList) { //Cluster CharSample = (LABELEDLIST) first_node (pCharList); // printf ("\nClustering %s ...", CharSample->Label); Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE); Config.MagicSamples = CharSample->SampleCount; ProtoList = ClusterSamples(Clusterer, &Config); CleanUpUnusedData(ProtoList); //Merge MergeInsignificantProtos(ProtoList, CharSample->Label, Clusterer, &Config); if (strcmp(test_ch, CharSample->Label) == 0) DisplayProtoList(test_ch, ProtoList); ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos, ShowInsignificantProtos, Clusterer->SampleSize); FreeClusterer(Clusterer); MergeClass = FindClass (ClassList, CharSample->Label); if (MergeClass == NULL) { MergeClass = NewLabeledClass (CharSample->Label); ClassList = push (ClassList, MergeClass); } Cid = AddConfigToClass(MergeClass->Class); MergeClass->Class->font_set.push_back(fontinfo_id); pProtoList = ProtoList; iterate (pProtoList) { Prototype = (PROTOTYPE *) first_node (pProtoList); // 
see if proto can be approximated by existing proto Pid = FindClosestExistingProto(MergeClass->Class, MergeClass->NumMerged, Prototype); if (Pid == NO_PROTO) { Pid = AddProtoToClass (MergeClass->Class); Proto = ProtoIn (MergeClass->Class, Pid); MakeNewFromOld (Proto, Prototype); MergeClass->NumMerged[Pid] = 1; } else { MakeNewFromOld (&DummyProto, Prototype); ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto, (FLOAT32) MergeClass->NumMerged[Pid], 1.0, ProtoIn (MergeClass->Class, Pid)); MergeClass->NumMerged[Pid] ++; } Config2 = MergeClass->Class->Configurations[Cid]; AddProtoToConfig (Pid, Config2); } FreeProtoList (&ProtoList); } FreeTrainingSamples (CharList); } //WriteMergedTrainingSamples(Directory,ClassList); WriteMicrofeat(Directory, ClassList); SetUpForFloat2Int(ClassList); IntTemplates = classify.CreateIntTemplates(TrainingData, unicharset_training); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "inttemp"); #ifdef __UNIX__ OutFile = Efopen (Filename, "w"); #else OutFile = Efopen (Filename, "wb"); #endif classify.WriteIntTemplates(OutFile, IntTemplates, unicharset_training); fclose (OutFile); strcpy (Filename, ""); if (Directory != NULL) { strcat (Filename, Directory); strcat (Filename, "/"); } strcat (Filename, "pffmtable"); // Now create pffmtable. WritePFFMTable(IntTemplates, Filename); // Write updated unicharset to a file. if (!unicharset_training.save_to_file(OutputUnicharsetFile)) { fprintf(stderr, "Failed to save unicharset to file %s\n", OutputUnicharsetFile); exit(1); } printf ("Done!\n"); /**/ FreeLabeledClassList (ClassList); return 0; } /* main */
/** SetUpForFloat2Int **************************************************/
/*
**  For every merged class in LabeledClassList, fill in the TrainingData
**  entry selected by unicharset_training.unichar_to_id(Label):
**    - a freshly allocated proto array holding X, Y, Length and Angle of
**      each merged proto, with A, B, C set to the result of passing
**      {X, Y, Angle} through Normalize();
**    - a freshly allocated configuration array in which each bit vector
**      is a word-for-word copy of the merged class's vector;
**    - the font set, handed over from the merged class with move().
*/
void SetUpForFloat2Int(LIST LabeledClassList) {
  iterate(LabeledClassList) {
    UnicityTableEqEq<int> font_set;
    MERGE_CLASS Merged = (MERGE_CLASS) first_node (LabeledClassList);
    CLASS_TYPE Target =
        &TrainingData[unicharset_training.unichar_to_id(Merged->Label)];
    int ProtoCount = Merged->Class->NumProtos;
    int ConfigCount = Merged->Class->NumConfigs;
    int p, c, w;

    // park the font set until the target class has been filled in
    font_set.move(&Merged->Class->font_set);

    // protos
    Target->NumProtos = ProtoCount;
    Target->MaxNumProtos = ProtoCount;
    Target->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * ProtoCount);
    for (p = 0; p < ProtoCount; p++) {
      PROTO Dst = ProtoIn(Target, p);
      PROTO Src = ProtoIn(Merged->Class, p);
      float Coeffs[3];

      Coeffs[0] = Src->X;
      Coeffs[1] = Src->Y;
      Coeffs[2] = Src->Angle;
      Normalize(Coeffs);

      Dst->X = Src->X;
      Dst->Y = Src->Y;
      Dst->Length = Src->Length;
      Dst->Angle = Src->Angle;
      Dst->A = Coeffs[0];
      Dst->B = Coeffs[1];
      Dst->C = Coeffs[2];
    }

    // configurations
    Target->NumConfigs = ConfigCount;
    Target->MaxNumConfigs = ConfigCount;
    Target->font_set.move(&font_set);
    Target->Configurations =
        (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * ConfigCount);
    int WordCount = WordsInVectorOfSize(ProtoCount);
    for (c = 0; c < ConfigCount; c++) {
      BIT_VECTOR Copy = NewBitVector(ProtoCount);
      BIT_VECTOR Original = Merged->Class->Configurations[c];

      for (w = 0; w < WordCount; w++)
        Copy[w] = Original[w];
      Target->Configurations[c] = Copy;
    }
  }
}                                // SetUpForFloat2Int