Example #1
0
/**
 * This routine searches thru all of the prototypes in
 * Class and returns the id of the proto which would provide
 * the best approximation of Prototype.  If no close
 * approximation can be found, NO_PROTO is returned.
 *
 * @param Class   class to search for matching old proto in
 * @param NumMerged # of protos merged into each proto of Class
 * @param  Prototype new proto to find match for
 *
 * Globals: none
 *
 * @return Id of closest proto in Class or NO_PROTO.
 * @note Exceptions: none
 * @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
 */
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE  *Prototype) {
  PROTO_STRUCT  NewProto;
  PROTO_STRUCT  MergedProto;
  int   Pid;
  PROTO   Proto;
  int   BestProto;
  FLOAT32 BestMatch;
  FLOAT32 Match, OldMatch, NewMatch;

  MakeNewFromOld (&NewProto, Prototype);

  BestProto = NO_PROTO;
  BestMatch = WORST_MATCH_ALLOWED;
  for (Pid = 0; Pid < Class->NumProtos; Pid++) {
    Proto  = ProtoIn(Class, Pid);
    ComputeMergedProto(Proto, &NewProto,
      (FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
    OldMatch = CompareProtos(Proto, &MergedProto);
    NewMatch = CompareProtos(&NewProto, &MergedProto);
    Match = MIN(OldMatch, NewMatch);
    if (Match > BestMatch) {
      BestProto = Pid;
      BestMatch = Match;
    }
  }
  return BestProto;
} /* FindClosestExistingProto */
Example #2
0
/*---------------------------------------------------------------------------*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE  *Prototype) {
/*
**  Parameters:
**    Class   class to search for matching old proto in
**    NumMerged[] # of protos merged into each proto of Class
**    Prototype new proto to find match for
**  Globals: none
**  Operation: This routine searches thru all of the prototypes in
**    Class and returns the id of the proto which would provide
**    the best approximation of Prototype.  If no close
**    approximation can be found, NO_PROTO is returned.
**  Return: Id of closest proto in Class or NO_PROTO.
**  Exceptions: none
**  History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
  PROTO_STRUCT  NewProto;
  PROTO_STRUCT  MergedProto;
  int   Pid;
  PROTO   Proto;
  int   BestProto;
  FLOAT32 BestMatch;
  FLOAT32 Match, OldMatch, NewMatch;

  MakeNewFromOld (&NewProto, Prototype);

  BestProto = NO_PROTO;
  BestMatch = WORST_MATCH_ALLOWED;
  for (Pid = 0; Pid < Class->NumProtos; Pid++) {
    Proto  = ProtoIn(Class, Pid);
    ComputeMergedProto(Proto, &NewProto,
      (FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
    OldMatch = CompareProtos(Proto, &MergedProto);
    NewMatch = CompareProtos(&NewProto, &MergedProto);
    Match = MIN(OldMatch, NewMatch);
    if (Match > BestMatch) {
      BestProto = Pid;
      BestMatch = Match;
    }
  }
  return BestProto;
} /* FindClosestExistingProto */
/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
**	Parameters:
**		argc	number of command line arguments
**		argv	array of command line arguments
**	Globals: none
**	Operation:
**		This program reads in a text file consisting of feature
**		samples from a training page in the following format:
**
**			FontName CharName NumberOfFeatureTypes(N)
**			   FeatureTypeName1 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   FeatureTypeName2 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   ...
**			   FeatureTypeNameN NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			FontName CharName ...
**
**		The result of this program is a binary inttemp file used by
**		the OCR engine.
**	Return: none
**	Exceptions: none
**	History:	Fri Aug 18 08:56:17 1989, DSJ, Created.
**				Mon May 18 1998, Christy Russson, Revistion started.
*/
  char	*PageName;
  FILE	*TrainingPage;
  FILE	*OutFile;
  LIST	CharList;
  CLUSTERER	*Clusterer = NULL;
  LIST		ProtoList = NIL;
  LABELEDLIST CharSample;
  PROTOTYPE	*Prototype;
  LIST   	ClassList = NIL;
  int		Cid, Pid;
  PROTO		Proto;
  PROTO_STRUCT	DummyProto;
  BIT_VECTOR	Config2;
  MERGE_CLASS	MergeClass;
  INT_TEMPLATES	IntTemplates;
  LIST pCharList, pProtoList;
  char Filename[MAXNAMESIZE];
  tesseract::Classify classify;

  ParseArguments (argc, argv);
  if (InputUnicharsetFile == NULL) {
    InputUnicharsetFile = kInputUnicharsetFile;
  }
  if (OutputUnicharsetFile == NULL) {
    OutputUnicharsetFile = kOutputUnicharsetFile;
  }

  if (!unicharset_training.load_from_file(InputUnicharsetFile)) {
    fprintf(stderr, "Failed to load unicharset from file %s\n"
            "Building unicharset for mftraining from scratch...\n",
            InputUnicharsetFile);
    unicharset_training.clear();
    // Space character needed to represent NIL classification.
    unicharset_training.unichar_insert(" ");
  }


  if (InputFontInfoFile != NULL) {
    FILE* f = fopen(InputFontInfoFile, "r");
    if (f == NULL) {
      fprintf(stderr, "Failed to load font_properties\n");
    } else {
      int italic, bold, fixed, serif, fraktur;
      while (!feof(f)) {
        FontInfo fontinfo;
        fontinfo.name = new char[1024];
        fontinfo.properties = 0;
        if (fscanf(f, "%1024s %i %i %i %i %i\n", fontinfo.name,
                   &italic, &bold, &fixed, &serif, &fraktur) != 6)
          continue;
        fontinfo.properties =
            (italic << 0) +
            (bold << 1) +
            (fixed << 2) +
            (serif << 3) +
            (fraktur << 4);
        if (!classify.get_fontinfo_table().contains(fontinfo)) {
          classify.get_fontinfo_table().push_back(fontinfo);
        } else {
          fprintf(stderr, "Font %s already defined\n", fontinfo.name);
          return 1;
        }
      }
      fclose(f);
    }
  }

  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf ("Reading %s ...\n", PageName);
    char *short_name = strrchr(PageName, '/');
    if (short_name == NULL)
      short_name = PageName;
    else
      ++short_name;
    // filename is expected to be of the form [lang].[fontname].exp[num].tr
    // If it is, then set short_name to be the [fontname]. Otherwise it is just
    // the file basename with the .tr extension removed.
    char *font_dot = strchr(short_name, '.');
    char *exp_dot = (font_dot != NULL) ? strstr(font_dot, ".exp") : NULL;
    if (font_dot != NULL && exp_dot != NULL && font_dot != exp_dot) {
      short_name = new_dup(font_dot + 1);
      short_name[exp_dot - font_dot - 1] = '\0';
    } else {
      short_name = new_dup(short_name);
      int len = strlen(short_name);
      if (!strcmp(short_name + len - 3, ".tr"))
        short_name[len - 3] = '\0';
    }
    int fontinfo_id;
    FontInfo fontinfo;
    fontinfo.name = short_name;
    fontinfo.properties = 0;  // Not used to lookup in the table
    if (!classify.get_fontinfo_table().contains(fontinfo)) {
      fontinfo_id = classify.get_fontinfo_table().push_back(fontinfo);
      printf("%s has no defined properties.\n", short_name);
    } else {
      fontinfo_id = classify.get_fontinfo_table().get_id(fontinfo);
      // Update the properties field
      fontinfo = classify.get_fontinfo_table().get(fontinfo_id);
      delete[] short_name;
    }
    TrainingPage = Efopen (PageName, "r");
    CharList = ReadTrainingSamples (TrainingPage);
    fclose (TrainingPage);
    //WriteTrainingSamples (Directory, CharList);
    pCharList = CharList;
    iterate(pCharList) {
      //Cluster
      CharSample = (LABELEDLIST) first_node (pCharList);
//    printf ("\nClustering %s ...", CharSample->Label);
      Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE);
      Config.MagicSamples = CharSample->SampleCount;
      ProtoList = ClusterSamples(Clusterer, &Config);
      CleanUpUnusedData(ProtoList);

      //Merge
      MergeInsignificantProtos(ProtoList, CharSample->Label,
                               Clusterer, &Config);
      if (strcmp(test_ch, CharSample->Label) == 0)
        DisplayProtoList(test_ch, ProtoList);
      ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
                                            ShowInsignificantProtos,
                                            Clusterer->SampleSize);
      FreeClusterer(Clusterer);
      MergeClass = FindClass (ClassList, CharSample->Label);
      if (MergeClass == NULL) {
        MergeClass = NewLabeledClass (CharSample->Label);
        ClassList = push (ClassList, MergeClass);
      }
      Cid = AddConfigToClass(MergeClass->Class);
      MergeClass->Class->font_set.push_back(fontinfo_id);
      pProtoList = ProtoList;
      iterate (pProtoList) {
        Prototype = (PROTOTYPE *) first_node (pProtoList);

        // see if proto can be approximated by existing proto
        Pid = FindClosestExistingProto(MergeClass->Class,
                                       MergeClass->NumMerged, Prototype);
        if (Pid == NO_PROTO) {
          Pid = AddProtoToClass (MergeClass->Class);
          Proto = ProtoIn (MergeClass->Class, Pid);
          MakeNewFromOld (Proto, Prototype);
          MergeClass->NumMerged[Pid] = 1;
        }
        else {
          MakeNewFromOld (&DummyProto, Prototype);
          ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
              (FLOAT32) MergeClass->NumMerged[Pid], 1.0,
              ProtoIn (MergeClass->Class, Pid));
          MergeClass->NumMerged[Pid] ++;
        }
        Config2 = MergeClass->Class->Configurations[Cid];
        AddProtoToConfig (Pid, Config2);
      }
      FreeProtoList (&ProtoList);
    }
    FreeTrainingSamples (CharList);
  }
  //WriteMergedTrainingSamples(Directory,ClassList);
  WriteMicrofeat(Directory, ClassList);
  SetUpForFloat2Int(ClassList);
  IntTemplates = classify.CreateIntTemplates(TrainingData,
                                             unicharset_training);
  strcpy (Filename, "");
  if (Directory != NULL) {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "inttemp");
#ifdef __UNIX__
  OutFile = Efopen (Filename, "w");
#else
  OutFile = Efopen (Filename, "wb");
#endif
  classify.WriteIntTemplates(OutFile, IntTemplates, unicharset_training);
  fclose (OutFile);
  strcpy (Filename, "");
  if (Directory != NULL) {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "pffmtable");
  // Now create pffmtable.
  WritePFFMTable(IntTemplates, Filename);
  // Write updated unicharset to a file.
  if (!unicharset_training.save_to_file(OutputUnicharsetFile)) {
    fprintf(stderr, "Failed to save unicharset to file %s\n",
            OutputUnicharsetFile);
    exit(1);
  }
  printf ("Done!\n"); /**/
  FreeLabeledClassList (ClassList);
  return 0;
}	/* main */