/*---------------------------------------------------------------------------*/
void
LearnBlob (TBLOB * Blob, TEXTROW * Row, char BlobText[], int TextLength)
/*
 **      Parameters:
 **              Blob            blob whose micro-features are to be learned
 **              Row             row of text that blob came from
 **              BlobText        text that corresponds to blob
 **              TextLength      number of characters in blob
 **      Globals:
 **              imagefile       base filename of the page being learned
 **              FontName        name of font currently being trained on
 **      Operation:
 **              Extract micro-features from the specified blob and append
 **              them to the appropriate file.
 **      Return: none
 **      Exceptions: none
 **      History: 7/28/89, DSJ, Created.
 */
#define MAXFILENAME     80
#define MAXCHARNAME     20
#define MAXFONTNAME     20
#define TRAIN_SUFFIX    ".tr"
{
  static FILE *FeatureFile = NULL;
  char Filename[MAXFILENAME];
  char CharName[MAXCHARNAME];
  CHAR_DESC CharDesc;
  LINE_STATS LineStats;

  EnterLearnMode;

  // throw out blobs which do not represent only one character
  if (TextLength != 1)
    return;

  GetLineStatsFromRow(Row, &LineStats); 

  CharDesc = ExtractBlobFeatures (Blob, &LineStats);

  // if a feature file is not yet open, open it
  // the name of the file is the name of the image plus TRAIN_SUFFIX
  if (FeatureFile == NULL) {
    strcpy(Filename, imagefile); 
    strcat(Filename, TRAIN_SUFFIX); 
    FeatureFile = Efopen (Filename, "w");

    cprintf ("TRAINING ... Font name = %s.\n", FontName);
  }

  // get the name of the character for this blob
  chartoname (CharName, BlobText[0], "");

  // label the features with a class name and font name
  fprintf (FeatureFile, "\n%s %s ", FontName, CharName);

  // write micro-features to file and clean up
  WriteCharDescription(FeatureFile, CharDesc); 
  FreeCharDescription(CharDesc); 

}                                // LearnBlob
/*--------------------------------------------------------------------------*/
void WriteMicrofeat(
    char	*Directory,
	LIST	ClassList)

{
	FILE		*File;
	char		Filename[MAXNAMESIZE];
	MERGE_CLASS MergeClass;

	strcpy (Filename, "");
	if (Directory != NULL)
	{
		strcat (Filename, Directory);
		strcat (Filename, "/");
	}
	strcat (Filename, "Microfeat");
	File = Efopen (Filename, "w");
	printf ("\nWriting Merged %s ...", Filename);
	iterate(ClassList)
	{
		MergeClass = (MERGE_CLASS) first_node (ClassList);
		WriteProtos(File, MergeClass);
		WriteConfigs(File, MergeClass->Class);
	}
	fclose (File);
} // WriteMicrofeat
/*----------------------------------------------------------------------------*/
void WriteNormProtos (
     char	*Directory,
     LIST	LabeledProtoList,
	 CLUSTERER *Clusterer)

/*
**	Parameters:
**		Directory	directory to place sample files into
**	Globals:
**		MaxNumSamples	max number of samples per class to write
**	Operation:
**		This routine writes the specified samples into files which
**		are organized according to the font name and character name
**		of the samples.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 16:17:06 1989, DSJ, Created.
*/

{
	FILE		*File;
	char		Filename[MAXNAMESIZE];
	LABELEDLIST LabeledProto;
	int N;

	strcpy (Filename, "");
	if (Directory != NULL)
	{
		strcat (Filename, Directory);
		strcat (Filename, "/");
	}
	strcat (Filename, "normproto");
	printf ("\nWriting %s ...", Filename);
	File = Efopen (Filename, "w");
	fprintf(File,"%0d\n",Clusterer->SampleSize);
	WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
	iterate(LabeledProtoList)
	{
		LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
		N = NumberOfProtos(LabeledProto->List,
		ShowSignificantProtos, ShowInsignificantProtos);
                if (N < 1) {
                  printf ("\nError! Not enough protos for %s: %d protos"
                          " (%d significant protos"
                          ", %d insignificant protos)\n",
                          LabeledProto->Label, N,
                          NumberOfProtos(LabeledProto->List, 1, 0),
                          NumberOfProtos(LabeledProto->List, 0, 1));
                  exit(1);
                }
		fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
		WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
			ShowSignificantProtos, ShowInsignificantProtos);
	}
	fclose (File);

}	// WriteNormProtos
Beispiel #4
0
// Writes stored training data to a .tr file based on the given filename.
// Returns false on error.
bool Classify::WriteTRFile(const STRING& filename) {
  STRING tr_filename = filename + ".tr";
  FILE* fp = Efopen(tr_filename.string(), "wb");
  size_t len = tr_file_data_.length();
  bool result =
      fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
  fclose(fp);
  tr_file_data_.truncate_at(0);
  return result;
}
/*---------------------------------------------------------------------------*/
void WriteMergedTrainingSamples(
    char	*Directory,
	LIST ClassList)

{
	FILE		*File;
	char		Filename[MAXNAMESIZE];
	MERGE_CLASS MergeClass;

	iterate (ClassList)
	{
		MergeClass = (MERGE_CLASS) first_node (ClassList);
		strcpy (Filename, "");
		if (Directory != NULL)
		{
			strcat (Filename, Directory);
			strcat (Filename, "/");
		}
		strcat (Filename, "Merged/");
		strcat (Filename, MergeClass->Label);
		strcat (Filename, PROTO_SUFFIX);
		printf ("\nWriting Merged %s ...", Filename);
		File = Efopen (Filename, "w");
		WriteOldProtoFile (File, MergeClass->Class);
		fclose (File);

		strcpy (Filename, "");
		if (Directory != NULL)
		{
			strcat (Filename, Directory);
			strcat (Filename, "/");
		}
		strcat (Filename, "Merged/");
		strcat (Filename, MergeClass->Label);
		strcat (Filename, CONFIG_SUFFIX);
		printf ("\nWriting Merged %s ...", Filename);
		File = Efopen (Filename, "w");
		WriteOldConfigFile (File, MergeClass->Class);
		fclose (File);
	}

}	// WriteMergedTrainingSamples
Beispiel #6
0
/*----------------------------------------------------------------------------*/
void WriteNormProtos (
     const char  *Directory,
     LIST  LabeledProtoList,
   CLUSTERER *Clusterer)

/*
**  Parameters:
**    Directory  directory to place sample files into
**  Operation:
**    This routine writes the specified samples into files which
**    are organized according to the font name and character name
**    of the samples.
**  Return: none
**  Exceptions: none
**  History: Fri Aug 18 16:17:06 1989, DSJ, Created.
*/

{
  FILE    *File;
  STRING Filename;
  LABELEDLIST LabeledProto;
  int N;

  Filename = "";
  if (Directory != NULL && Directory[0] != '\0')
  {
    Filename += Directory;
    Filename += "/";
  }
  Filename += "normproto";
  printf ("\nWriting %s ...", Filename.string());
  File = Efopen (Filename.string(), "wb");
  fprintf(File,"%0d\n",Clusterer->SampleSize);
  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
  iterate(LabeledProtoList)
  {
    LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
    N = NumberOfProtos(LabeledProto->List, true, false);
    if (N < 1) {
      printf ("\nError! Not enough protos for %s: %d protos"
              " (%d significant protos"
              ", %d insignificant protos)\n",
              LabeledProto->Label, N,
              NumberOfProtos(LabeledProto->List, 1, 0),
              NumberOfProtos(LabeledProto->List, 0, 1));
      exit(1);
    }
    fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
    WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
  }
  fclose (File);

}  // WriteNormProtos
/*---------------------------------------------------------------------------*/
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
               TBLOB * Blob, const DENORM& denorm, const char* BlobText) {
/*
 **      Parameters:
 **              Blob            blob whose micro-features are to be learned
 **              Row             row of text that blob came from
 **              BlobText        text that corresponds to blob
 **              TextLength      number of characters in blob
 **      Globals:
 **              imagefile       base filename of the page being learned
 **              classify_font_name
 **                              name of font currently being trained on
 **      Operation:
 **              Extract micro-features from the specified blob and append
 **              them to the appropriate file.
 **      Return: none
 **      Exceptions: none
 **      History: 7/28/89, DSJ, Created.
 */
#define TRAIN_SUFFIX    ".tr"
  static FILE *FeatureFile = NULL;
  STRING Filename(filename);

  // If no fontname was set, try to extract it from the filename
  STRING CurrFontName = classify_font_name;
  if (CurrFontName == kUnknownFontName) {
    // filename is expected to be of the form [lang].[fontname].exp[num]
    // The [lang], [fontname] and [num] fields should not have '.' characters.
    const char *basename = strrchr(filename.string(), '/');
    const char *firstdot = strchr(basename ? basename : filename.string(), '.');
    const char *lastdot  = strrchr(filename.string(), '.');
    if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
      ++firstdot;
      CurrFontName = firstdot;
      CurrFontName[lastdot - firstdot] = '\0';
    }
  }

  // if a feature file is not yet open, open it
  // the name of the file is the name of the image plus TRAIN_SUFFIX
  if (FeatureFile == NULL) {
    Filename += TRAIN_SUFFIX;
    FeatureFile = Efopen(Filename.string(), "w");
    cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
  }

  LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText,
            CurrFontName.string());
}                                // LearnBlob
/*----------------------------------------------------------------------------*/
void WriteClusteredTrainingSamples (
     char	*Directory,
     LIST	ProtoList,
	 CLUSTERER *Clusterer,
	 LABELEDLIST CharSample)

/*
**	Parameters:
**		Directory	directory to place sample files into
**	Operation:
**		This routine writes the specified samples into files which
**		are organized according to the font name and character name
**		of the samples.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 16:17:06 1989, DSJ, Created.
*/

{
	FILE		*File;
	char		Filename[MAXNAMESIZE];

	strcpy (Filename, "");
	if (Directory != NULL)
	{
		strcat (Filename, Directory);
		strcat (Filename, "/");
	}
	strcat (Filename, CTFontName);
	strcat (Filename, "/");
	strcat (Filename, CharSample->Label);
	strcat (Filename, ".");
	strcat (Filename, PROGRAM_FEATURE_TYPE);
	strcat (Filename, ".p");
	printf ("\nWriting %s ...", Filename);
	File = Efopen (Filename, "w");
	WriteProtoList(File, Clusterer->SampleSize, Clusterer->ParamDesc,
		ProtoList, ShowSignificantProtos, ShowInsignificantProtos);
	fclose (File);

}	/* WriteClusteredTrainingSamples */
/*---------------------------------------------------------------------------*/
void GetNormProtos() {
/*
 **	Parameters: none
 **	Globals:
 **		NormProtoFile	name of file containing normalization protos
 **		NormProtos	global data structure to hold protos
 **	Operation: This routine reads in a set of character normalization
 **		protos from NormProtoFile and places them into NormProtos.
 **	Return: none
 **	Exceptions: none
 **	History: Wed Dec 19 16:24:25 1990, DSJ, Created.
 */
  FILE *File;
  char name[1024];

  strcpy(name, demodir);
  strcat(name, NormProtoFile);
  File = Efopen (name, "r");
  NormProtos = ReadNormProtos (File);
  fclose(File);

}                                /* GetNormProtos */
Beispiel #10
0
/**
* This routine writes the specified samples into files which
* are organized according to the font name and character name
* of the samples.
* @param Directory  directory to place sample files into
* @param LabeledProtoList List of labeled protos
* @param feature_desc Description of the features
* @return none
* @note Exceptions: none
* @note History: Fri Aug 18 16:17:06 1989, DSJ, Created.
*/
void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
                     const FEATURE_DESC_STRUCT *feature_desc) {
  FILE    *File;
  STRING Filename;
  LABELEDLIST LabeledProto;
  int N;

  Filename = "";
  if (Directory != NULL && Directory[0] != '\0')
  {
    Filename += Directory;
    Filename += "/";
  }
  Filename += "normproto";
  printf ("\nWriting %s ...", Filename.string());
  File = Efopen (Filename.string(), "wb");
  fprintf(File, "%0d\n", feature_desc->NumParams);
  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
  iterate(LabeledProtoList)
  {
    LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
    N = NumberOfProtos(LabeledProto->List, true, false);
    if (N < 1) {
      printf ("\nError! Not enough protos for %s: %d protos"
              " (%d significant protos"
              ", %d insignificant protos)\n",
              LabeledProto->Label, N,
              NumberOfProtos(LabeledProto->List, 1, 0),
              NumberOfProtos(LabeledProto->List, 0, 1));
      exit(1);
    }
    fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
    WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
  }
  fclose (File);

}  // WriteNormProtos
/*---------------------------------------------------------------------------*/
void ReadNewCutoffs(const char *Filename,
                    CLASS_TO_INDEX ClassMapper,
                    CLASS_CUTOFF_ARRAY Cutoffs) {
/*
 **	Parameters:
 **		Filename	name of file containing cutoff definitions
 **             ClassMapper     array which maps class id's to class indexes
 **		Cutoffs		array to put cutoffs into
 **	Globals: none
 **	Operation: Open Filename, read in all of the class-id/cutoff pairs
 **		and insert them into the Cutoffs array.  Cutoffs are
 **		inserted in the array so that the array is indexed by
 **		class index rather than class id.  Unused entries in the
 **		array are set to an arbitrarily high cutoff value.
 **	Return: none
 **	Exceptions: none
 **	History: Wed Feb 20 09:38:26 1991, DSJ, Created.
 */
  FILE *CutoffFile;
  char Class[UNICHAR_LEN + 1];
  CLASS_ID ClassId;
  int Cutoff;
  int i;

  CutoffFile = Efopen (Filename, "r");

  for (i = 0; i < MAX_NUM_CLASSES; i++)
    Cutoffs[i] = MAX_CUTOFF;

  while (fscanf (CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d",
                 Class, &Cutoff) == 2) {
    ClassId = unicharset.unichar_to_id(Class);
    Cutoffs[ClassMapper[ClassId]] = Cutoff;
  }
  fclose(CutoffFile);

}                                /* ReadNewCutoffs */
/*--------------------------------------------------------------------------*/
void WritePFFMTable(INT_TEMPLATES Templates, const char* filename) {
  FILE* fp = Efopen(filename, "wb");
  /* then write out each class */
  for (int i = 0; i < Templates->NumClasses; i++) {
    INT_CLASS Class = ClassForClassId (Templates, i);
    // Todo: Test with min instead of max
    // int MaxLength = LengthForConfigId(Class, 0);
    int MaxLength = 0;
    const char *unichar = unicharset_training.id_to_unichar(i);
    if (strcmp(unichar, " ") == 0) {
      unichar = "NULL";
    } else if (Class->NumConfigs == 0) {
      cprintf("Error: no configs for class %s in mftraining\n", unichar);
    }
    for (int ConfigId = 0; ConfigId < Class->NumConfigs; ConfigId++) {
      // Todo: Test with min instead of max
      // if (LengthForConfigId (Class, ConfigId) < MaxLength)
      if (Class->ConfigLengths[ConfigId] > MaxLength)
        MaxLength = Class->ConfigLengths[ConfigId];
    }
    fprintf(fp, "%s %d\n", unichar, MaxLength);
  }
  fclose(fp);
} // WritePFFMTable
// Creates a MasterTraininer and loads the training data into it:
// Initializes feature_defs and IntegerFX.
// Loads the shape_table if shape_table != NULL.
// Loads initial unicharset from -U command-line option.
// If FLAGS_input_trainer is set, loads the majority of data from there, else:
//   Loads font info from -F option.
//   Loads xheights from -X option.
//   Loads samples from .tr files in remaining command-line args.
//   Deletes outliers and computes canonical samples.
//   If FLAGS_output_trainer is set, saves the trainer for future use.
// Computes canonical and cloud features.
// If shape_table is not NULL, but failed to load, make a fake flat one,
// as shape clustering was not run.
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (NULL shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != NULL) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != NULL)
      shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  if (FLAGS_input_trainer.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return NULL;
      }
    }
    IntFeatureSpace fs;
    fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != NULL) {
      tprintf("Reading %s ...\n", page_name);
      FILE* fp = Efopen(page_name, "rb");
      trainer->ReadTrainingSamples(fp, feature_defs, false);
      fclose(fp);

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      int pagename_len = strlen(page_name);
      char *fontinfo_file_name = new char[pagename_len + 7];
      strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
      strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
      trainer->AddSpacingInfo(fontinfo_file_name);
      delete[] fontinfo_file_name;

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(image_name.length() - 2);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == NULL) {
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_input_trainer.c_str());
    FILE* fp = fopen(FLAGS_input_trainer.c_str(), "rb");
    if (fp == NULL) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_input_trainer.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return NULL;
    }
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return NULL;
  }
  if (shape_table != NULL) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == NULL) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}
Beispiel #14
0
/**
* This program reads in a text file consisting of feature
* samples from a training page in the following format:
* @verbatim
   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...
@endverbatim
* It then appends these samples into a separate file for each
* character.  The name of the file is
*
*   DirectoryName/FontName/CharName.FeatureTypeName
*
* The DirectoryName can be specified via a command
* line argument.  If not specified, it defaults to the
* current directory.  The format of the resulting files is:
* @verbatim
   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...
@endverbatim
* The output files each have a header which describes the
* type of feature which the file contains.  This header is
* in the format required by the clusterer.  A command line
* argument can also be used to specify that only the first
* N samples of each class should be used.
* @param argc  number of command line arguments
* @param argv  array of command line arguments
* @return none
* @note Globals: none
* @note Exceptions: none
* @note History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/
int main(int argc, char *argv[]) {
  // Set the global Config parameters before parsing the command line.
  Config = CNConfig;

  const char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(&argc, &argv);
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "rb");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  // The norm protos will count the source protos, so we keep them here in
  // freeable_protos, so they can be freed later.
  GenericVector<LIST> freeable_protos;
  iterate(pCharList) {
    //Cluster
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    if (Clusterer == NULL) {  // To avoid a SIGSEGV
      fprintf(stderr, "Error: NULL clusterer!\n");
      return 1;
    }
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
    freeable_protos.push_back(ProtoList);
    FreeClusterer(Clusterer);
  }
  FreeTrainingSamples(CharList);
  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
                  FeatureDefs.FeatureDesc[desc_index]);
  FreeNormProtoList(NormProtoList);
  for (int i = 0; i < freeable_protos.size(); ++i) {
    FreeProtoList(&freeable_protos[i]);
  }
  printf ("\n");
  return 0;
}  // main
/*---------------------------------------------------------------------------*/
int main (
     int	argc,
     char	**argv)

/*
**	Parameters:
**		argc	number of command line arguments
**		argv	array of command line arguments
**	Globals: none
**	Operation:
**		This program reads in a text file consisting of feature
**		samples from a training page in the following format:
**
**			FontName CharName NumberOfFeatureTypes(N)
**			   FeatureTypeName1 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   FeatureTypeName2 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   ...
**			   FeatureTypeNameN NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			FontName CharName ...
**
**		It then appends these samples into a separate file for each
**		character.  The name of the file is
**
**			DirectoryName/FontName/CharName.FeatureTypeName
**
**		The DirectoryName can be specified via a command
**		line argument.  If not specified, it defaults to the
**		current directory.  The format of the resulting files is:
**
**			NumberOfFeatures(M)
**			   Feature1
**			   ...
**			   FeatureM
**			NumberOfFeatures(M)
**			...
**
**		The output files each have a header which describes the
**		type of feature which the file contains.  This header is
**		in the format required by the clusterer.  A command line
**		argument can also be used to specify that only the first
**		N samples of each class should be used.
**	Return: none
**	Exceptions: none
**	History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/

{
	char	*PageName;
	FILE	*TrainingPage;
	LIST	CharList = NIL;
	CLUSTERER	*Clusterer = NULL;
	LIST		ProtoList = NIL;
	LIST		NormProtoList = NIL;
	LIST pCharList;
	LABELEDLIST CharSample;

	ParseArguments (argc, argv);
	while ((PageName = GetNextFilename()) != NULL)
	{
		printf ("Reading %s ...\n", PageName);
		TrainingPage = Efopen (PageName, "r");
		ReadTrainingSamples (TrainingPage, &CharList);
		fclose (TrainingPage);
		//WriteTrainingSamples (Directory, CharList);
	}
        printf("Clustering ...\n");
	pCharList = CharList;
	iterate(pCharList)
	{
          //Cluster
          CharSample = (LABELEDLIST) first_node (pCharList);
          //printf ("\nClustering %s ...", CharSample->Label);
          Clusterer = SetUpForClustering(CharSample);
          float SavedMinSamples = Config.MinSamples;
          Config.MagicSamples = CharSample->SampleCount;
          while (Config.MinSamples > 0.001) {
            ProtoList = ClusterSamples(Clusterer, &Config);
            if (NumberOfProtos(ProtoList, 1, 0) > 0)
              break;
            else {
              Config.MinSamples *= 0.95;
              printf("0 significant protos for %s."
                     " Retrying clustering with MinSamples = %f%%\n",
                     CharSample->Label, Config.MinSamples);
            }
          }
          Config.MinSamples = SavedMinSamples;
          AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
	}
	FreeTrainingSamples (CharList);
	WriteNormProtos (Directory, NormProtoList, Clusterer);
	FreeClusterer(Clusterer);
	FreeProtoList(&ProtoList);
	FreeNormProtoList(NormProtoList);
	printf ("\n");
  return 0;
}	// main
Beispiel #16
0
/*---------------------------------------------------------------------------*/
void
LearnBlob (const STRING& filename,
           TBLOB * Blob, TEXTROW * Row, char BlobText[])
/*
 **      Parameters:
 **              Blob            blob whose micro-features are to be learned
 **              Row             row of text that blob came from
 **              BlobText        text that corresponds to blob
 **              TextLength      number of characters in blob
 **      Globals:
 **              imagefile       base filename of the page being learned
 **              classify_font_name
 **                              name of font currently being trained on
 **      Operation:
 **              Extract micro-features from the specified blob and append
 **              them to the appropriate file.
 **      Return: none
 **      Exceptions: none
 **      History: 7/28/89, DSJ, Created.
 */
#define TRAIN_SUFFIX    ".tr"
{
  static FILE *FeatureFile = NULL;
  STRING Filename(filename);
  CHAR_DESC CharDesc;
  LINE_STATS LineStats;

  EnterLearnMode;

  GetLineStatsFromRow(Row, &LineStats);

  CharDesc = ExtractBlobFeatures (Blob, &LineStats);
  if (CharDesc == NULL) {
    cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
    return;
  }

  // If no fontname was set, try to extract it from the filename
  char CurrFontName[32] = "";
  strncpy(CurrFontName, static_cast<STRING>(classify_font_name).string(), 32);
/*
  if (!strcmp(CurrFontName, "UnknownFont")) {
    // filename is expected to be of the form [lang].[fontname].exp[num]
    // The [lang], [fontname] and [num] fields should not have '.' characters.
    const char *basename = strrchr(filename.string(), '/');
    const char *firstdot  = strchr(basename, '.');
    const char *lastdot  = strrchr(filename.string(), '.');
    if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
      strncpy(CurrFontName, firstdot + 1, lastdot - firstdot - 1);
    }
  }
//*/
  // if a feature file is not yet open, open it
  // the name of the file is the name of the image plus TRAIN_SUFFIX
  if (FeatureFile == NULL) {
    Filename += TRAIN_SUFFIX;
    FeatureFile = Efopen (Filename.string(), "w");
    cprintf ("TRAINING ... Font name = %s\n", CurrFontName);
  }

  // label the features with a class name and font name
  fprintf (FeatureFile, "\n%s %s ", CurrFontName, BlobText);

  // write micro-features to file and clean up
  WriteCharDescription(FeatureFile, CharDesc);
  FreeCharDescription(CharDesc);

}                                // LearnBlob
/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
**	Parameters:
**		argc	number of command line arguments
**		argv	array of command line arguments
**	Globals: none
**	Operation:
**		This program reads in a text file consisting of feature
**		samples from a training page in the following format:
**
**			FontName CharName NumberOfFeatureTypes(N)
**			   FeatureTypeName1 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   FeatureTypeName2 NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			   ...
**			   FeatureTypeNameN NumberOfFeatures(M)
**			      Feature1
**			      ...
**			      FeatureM
**			FontName CharName ...
**
**		The result of this program is a binary inttemp file used by
**		the OCR engine.
**	Return: none
**	Exceptions: none
**	History:	Fri Aug 18 08:56:17 1989, DSJ, Created.
**				Mon May 18 1998, Christy Russson, Revistion started.
*/
  char	*PageName;
  FILE	*TrainingPage;
  FILE	*OutFile;
  LIST	CharList;
  CLUSTERER	*Clusterer = NULL;
  LIST		ProtoList = NIL;
  LABELEDLIST CharSample;
  PROTOTYPE	*Prototype;
  LIST   	ClassList = NIL;
  int		Cid, Pid;
  PROTO		Proto;
  PROTO_STRUCT	DummyProto;
  BIT_VECTOR	Config2;
  MERGE_CLASS	MergeClass;
  INT_TEMPLATES	IntTemplates;
  LIST pCharList, pProtoList;
  char Filename[MAXNAMESIZE];
  tesseract::Classify classify;

  ParseArguments (argc, argv);
  if (InputUnicharsetFile == NULL) {
    InputUnicharsetFile = kInputUnicharsetFile;
  }
  if (OutputUnicharsetFile == NULL) {
    OutputUnicharsetFile = kOutputUnicharsetFile;
  }

  if (!unicharset_training.load_from_file(InputUnicharsetFile)) {
    fprintf(stderr, "Failed to load unicharset from file %s\n"
            "Building unicharset for mftraining from scratch...\n",
            InputUnicharsetFile);
    unicharset_training.clear();
    // Space character needed to represent NIL classification.
    unicharset_training.unichar_insert(" ");
  }


  if (InputFontInfoFile != NULL) {
    FILE* f = fopen(InputFontInfoFile, "r");
    if (f == NULL) {
      fprintf(stderr, "Failed to load font_properties\n");
    } else {
      int italic, bold, fixed, serif, fraktur;
      while (!feof(f)) {
        FontInfo fontinfo;
        fontinfo.name = new char[1024];
        fontinfo.properties = 0;
        if (fscanf(f, "%1024s %i %i %i %i %i\n", fontinfo.name,
                   &italic, &bold, &fixed, &serif, &fraktur) != 6)
          continue;
        fontinfo.properties =
            (italic << 0) +
            (bold << 1) +
            (fixed << 2) +
            (serif << 3) +
            (fraktur << 4);
        if (!classify.get_fontinfo_table().contains(fontinfo)) {
          classify.get_fontinfo_table().push_back(fontinfo);
        } else {
          fprintf(stderr, "Font %s already defined\n", fontinfo.name);
          return 1;
        }
      }
      fclose(f);
    }
  }

  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf ("Reading %s ...\n", PageName);
    char *short_name = strrchr(PageName, '/');
    if (short_name == NULL)
      short_name = PageName;
    else
      ++short_name;
    // filename is expected to be of the form [lang].[fontname].exp[num].tr
    // If it is, then set short_name to be the [fontname]. Otherwise it is just
    // the file basename with the .tr extension removed.
    char *font_dot = strchr(short_name, '.');
    char *exp_dot = (font_dot != NULL) ? strstr(font_dot, ".exp") : NULL;
    if (font_dot != NULL && exp_dot != NULL && font_dot != exp_dot) {
      short_name = new_dup(font_dot + 1);
      short_name[exp_dot - font_dot - 1] = '\0';
    } else {
      short_name = new_dup(short_name);
      int len = strlen(short_name);
      if (!strcmp(short_name + len - 3, ".tr"))
        short_name[len - 3] = '\0';
    }
    int fontinfo_id;
    FontInfo fontinfo;
    fontinfo.name = short_name;
    fontinfo.properties = 0;  // Not used to lookup in the table
    if (!classify.get_fontinfo_table().contains(fontinfo)) {
      fontinfo_id = classify.get_fontinfo_table().push_back(fontinfo);
      printf("%s has no defined properties.\n", short_name);
    } else {
      fontinfo_id = classify.get_fontinfo_table().get_id(fontinfo);
      // Update the properties field
      fontinfo = classify.get_fontinfo_table().get(fontinfo_id);
      delete[] short_name;
    }
    TrainingPage = Efopen (PageName, "r");
    CharList = ReadTrainingSamples (TrainingPage);
    fclose (TrainingPage);
    //WriteTrainingSamples (Directory, CharList);
    pCharList = CharList;
    iterate(pCharList) {
      //Cluster
      CharSample = (LABELEDLIST) first_node (pCharList);
//    printf ("\nClustering %s ...", CharSample->Label);
      Clusterer = SetUpForClustering(CharSample, PROGRAM_FEATURE_TYPE);
      Config.MagicSamples = CharSample->SampleCount;
      ProtoList = ClusterSamples(Clusterer, &Config);
      CleanUpUnusedData(ProtoList);

      //Merge
      MergeInsignificantProtos(ProtoList, CharSample->Label,
                               Clusterer, &Config);
      if (strcmp(test_ch, CharSample->Label) == 0)
        DisplayProtoList(test_ch, ProtoList);
      ProtoList = RemoveInsignificantProtos(ProtoList, ShowSignificantProtos,
                                            ShowInsignificantProtos,
                                            Clusterer->SampleSize);
      FreeClusterer(Clusterer);
      MergeClass = FindClass (ClassList, CharSample->Label);
      if (MergeClass == NULL) {
        MergeClass = NewLabeledClass (CharSample->Label);
        ClassList = push (ClassList, MergeClass);
      }
      Cid = AddConfigToClass(MergeClass->Class);
      MergeClass->Class->font_set.push_back(fontinfo_id);
      pProtoList = ProtoList;
      iterate (pProtoList) {
        Prototype = (PROTOTYPE *) first_node (pProtoList);

        // see if proto can be approximated by existing proto
        Pid = FindClosestExistingProto(MergeClass->Class,
                                       MergeClass->NumMerged, Prototype);
        if (Pid == NO_PROTO) {
          Pid = AddProtoToClass (MergeClass->Class);
          Proto = ProtoIn (MergeClass->Class, Pid);
          MakeNewFromOld (Proto, Prototype);
          MergeClass->NumMerged[Pid] = 1;
        }
        else {
          MakeNewFromOld (&DummyProto, Prototype);
          ComputeMergedProto (ProtoIn (MergeClass->Class, Pid), &DummyProto,
              (FLOAT32) MergeClass->NumMerged[Pid], 1.0,
              ProtoIn (MergeClass->Class, Pid));
          MergeClass->NumMerged[Pid] ++;
        }
        Config2 = MergeClass->Class->Configurations[Cid];
        AddProtoToConfig (Pid, Config2);
      }
      FreeProtoList (&ProtoList);
    }
    FreeTrainingSamples (CharList);
  }
  //WriteMergedTrainingSamples(Directory,ClassList);
  WriteMicrofeat(Directory, ClassList);
  SetUpForFloat2Int(ClassList);
  IntTemplates = classify.CreateIntTemplates(TrainingData,
                                             unicharset_training);
  strcpy (Filename, "");
  if (Directory != NULL) {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "inttemp");
#ifdef __UNIX__
  OutFile = Efopen (Filename, "w");
#else
  OutFile = Efopen (Filename, "wb");
#endif
  classify.WriteIntTemplates(OutFile, IntTemplates, unicharset_training);
  fclose (OutFile);
  strcpy (Filename, "");
  if (Directory != NULL) {
    strcat (Filename, Directory);
    strcat (Filename, "/");
  }
  strcat (Filename, "pffmtable");
  // Now create pffmtable.
  WritePFFMTable(IntTemplates, Filename);
  // Write updated unicharset to a file.
  if (!unicharset_training.save_to_file(OutputUnicharsetFile)) {
    fprintf(stderr, "Failed to save unicharset to file %s\n",
            OutputUnicharsetFile);
    exit(1);
  }
  printf ("Done!\n"); /**/
  FreeLabeledClassList (ClassList);
  return 0;
}	/* main */
/*---------------------------------------------------------------------------*/
int main (
     int  argc,
     char  **argv)

/*
**  Parameters:
**    argc  number of command line arguments
**    argv  array of command line arguments
**  Globals: none
**  Operation:
**    This program reads in a text file consisting of feature
**    samples from a training page in the following format:
**
**      FontName CharName NumberOfFeatureTypes(N)
**         FeatureTypeName1 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         FeatureTypeName2 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         ...
**         FeatureTypeNameN NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**      FontName CharName ...
**
**    It then appends these samples into a separate file for each
**    character.  The name of the file is
**
**      DirectoryName/FontName/CharName.FeatureTypeName
**
**    The DirectoryName can be specified via a command
**    line argument.  If not specified, it defaults to the
**    current directory.  The format of the resulting files is:
**
**      NumberOfFeatures(M)
**         Feature1
**         ...
**         FeatureM
**      NumberOfFeatures(M)
**      ...
**
**    The output files each have a header which describes the
**    type of feature which the file contains.  This header is
**    in the format required by the clusterer.  A command line
**    argument can also be used to specify that only the first
**    N samples of each class should be used.
**  Return: none
**  Exceptions: none
**  History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/

{
  char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(argc, argv);
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "r");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, 1.0f / 64.0f, 0.0f, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  iterate(pCharList) {
    //Cluster
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
  }
  FreeTrainingSamples(CharList);
  if (Clusterer == NULL) // To avoid a SIGSEGV
    return 1;
  WriteNormProtos (Directory, NormProtoList, Clusterer);
  FreeClusterer(Clusterer);
  FreeProtoList(&ProtoList);
  FreeNormProtoList(NormProtoList);
  printf ("\n");
  return 0;
}  // main
/*---------------------------------------------------------------------------*/
void WriteTrainingSamples (
    char	*Directory,
    LIST	CharList,
    const char* program_feature_type)

/*
 **	Parameters:
 **		Directory	directory to place sample files into
 **		FontList	list of fonts used in the training samples
 **	Operation:
 **		This routine writes the specified samples into files which
 **		are organized according to the font name and character name
 **		of the samples.
 **	Return: none
 **	Exceptions: none
 **	History: Fri Aug 18 16:17:06 1989, DSJ, Created.
 */

{
  LABELEDLIST	CharSample;
  FEATURE_SET	FeatureSet;
  LIST		FeatureList;
  FILE		*File;
  char		Filename[MAXNAMESIZE];
  int		NumSamples;

  iterate (CharList)		// iterate thru all of the fonts
  {
    CharSample = (LABELEDLIST) first_node (CharList);

    // construct the full pathname for the current samples file
    strcpy (Filename, "");
    if (Directory != NULL)
    {
      strcat (Filename, Directory);
      strcat (Filename, "/");
    }
    strcat (Filename, CTFontName);
    strcat (Filename, "/");
    strcat (Filename, CharSample->Label);
    strcat (Filename, ".");
    strcat (Filename, program_feature_type);
    printf ("\nWriting %s ...", Filename);

    /* if file does not exist, create a new one with an appropriate
       header; otherwise append samples to the existing file */
    File = fopen (Filename, "r");
    if (File == NULL)
    {
      File = Efopen (Filename, "w");
      WriteOldParamDesc(
          File,
          FeatureDefs.FeatureDesc[ShortNameToFeatureType(
              program_feature_type)]);
    }
    else
    {
      fclose (File);
      File = Efopen (Filename, "a");
    }

    // append samples onto the file
    FeatureList = CharSample->List;
    NumSamples = 0;
    iterate (FeatureList)
    {
      FeatureSet = (FEATURE_SET) first_node (FeatureList);
      WriteFeatureSet (File, FeatureSet);
      NumSamples++;
    }
    fclose (File);
  }
}	/* WriteTrainingSamples */