Exemple #1
0
/**
 * @name program_editup
 *
 * Initialize all the things in the program that need to be initialized.
 * init_classifier is forwarded to InitAdaptiveClassifier, and init_dict
 * determines whether the dictionary (Dawg models) is loaded.
 */
    void Wordrec::program_editup(const char *textbase,
                                 bool init_classifier,
                                 bool init_dict) {
        // Only overwrite imagefile when the caller actually supplied a name.
        if (textbase) {
            imagefile = textbase;
        }

        // Feature definitions are set up before the adaptive classifier.
        InitFeatureDefs(&feature_defs_);
        InitAdaptiveClassifier(init_classifier);

        // Dictionary loading is optional and uses the global Dawg cache.
        if (init_dict) {
            getDict().Load(Dict::GlobalDawgCache());
        }

        // Copy the chopper's split threshold for use in pass 2.
        pass2_ok_split = chop_ok_split;
    }
Exemple #2
0
/**
 * @name program_editup
 *
 * Initialize all the things in the program that need to be initialized.
 * init_classifier is forwarded to InitAdaptiveClassifier, and init_dict
 * determines whether the dictionary (Dawg models) is loaded.
 */
void Wordrec::program_editup(const char *textbase,
                             bool init_classifier,
                             bool init_dict) {
  // Only overwrite imagefile when the caller actually supplied a name.
  if (textbase) {
    imagefile = textbase;
  }

  // Feature definitions first, then the extractors that are set up from
  // them, then the adaptive classifier.
  InitFeatureDefs(&feature_defs_);
  SetupExtractors(&feature_defs_);
  InitAdaptiveClassifier(init_classifier);

  // Dictionary loading is optional.
  if (init_dict) {
    getDict().Load();
  }

  // Copy the chopper / segmentation-search settings for use in pass 2.
  pass2_ok_split = chop_ok_split;
  pass2_seg_states = wordrec_num_seg_states;
}
Exemple #3
0
/**
 * @name program_editup
 *
 * Initialize all the things in the program that need to be initialized.
 * init_classifier is forwarded to InitAdaptiveClassifier; when init_dict is
 * non-null, the dictionary (Dawg models) is loaded from it.
 */
void Wordrec::program_editup(const char *textbase,
                             TessdataManager *init_classifier,
                             TessdataManager *init_dict) {
  // Only overwrite imagefile when the caller actually supplied a name.
  if (textbase != nullptr) {
    imagefile = textbase;
  }

  // Feature definitions are set up before the adaptive classifier.
  InitFeatureDefs(&feature_defs_);
  InitAdaptiveClassifier(init_classifier);

  // A non-null init_dict requests a dictionary load, done in three steps:
  // attach the global Dawg cache, load for the current language, finalize.
  if (init_dict != nullptr) {
    auto &dictionary = getDict();
    dictionary.SetupForLoad(Dict::GlobalDawgCache());
    dictionary.Load(lang, init_dict);
    dictionary.FinishLoad();
  }

  // Copy the chopper's split threshold for use in pass 2.
  pass2_ok_split = chop_ok_split;
}
/*---------------------------------------------------------------------------*/
int main (
     int  argc,
     char  **argv)

/*
**  Parameters:
**    argc  number of command line arguments
**    argv  array of command line arguments
**  Globals:
**    Config     clustering configuration (declared outside this function);
**               MagicSamples is overwritten for every character and
**               MinSamples is temporarily lowered while retrying, then
**               restored.
**    Directory  output location handed to WriteNormProtos (declared outside
**               this function).
**  Operation:
**    This program reads in text files consisting of feature
**    samples from training pages in the following format:
**
**      FontName CharName NumberOfFeatureTypes(N)
**         FeatureTypeName1 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         FeatureTypeName2 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         ...
**         FeatureTypeNameN NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**      FontName CharName ...
**
**    Every file returned by GetNextFilename is read with
**    ReadTrainingSamples into one list of labeled character samples.
**    Each entry of that list is then clustered; if clustering yields
**    no significant prototypes, Config.MinSamples is reduced by 5% and
**    clustering is retried until it succeeds or MinSamples drops to
**    0.001 or below.  The resulting prototypes are collected per
**    character label and finally written with WriteNormProtos.
**
**    NOTE(review): the original header additionally described
**    appending samples to DirectoryName/FontName/CharName.FeatureTypeName
**    files and a command line option limiting each class to its first
**    N samples.  Nothing in this function does that directly; confirm
**    whether it still applies to the helpers called here.
**  Return:
**    0 on success; 1 if no clusterer is available (no character
**    samples were read, or SetUpForClustering returned NULL).
**  Exceptions: none
**  History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/

{
  char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(argc, argv);
  // num_fonts is only consumed by the optional MinSamples tweak that is
  // commented out below.
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "r");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, 1.0f / 64.0f, 0.0f, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  // NOTE(review): every iteration overwrites Clusterer and ProtoList without
  // releasing the previous ones; only the last pair is freed after the loop.
  // The ownership relation between the proto lists, NormProtoList and the
  // clusterer is not visible here, so that is deliberately left unchanged.
  iterate(pCharList) {
    //Cluster
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    // Bail out before the clusterer is used; previously the only NULL check
    // came after the loop, i.e. after ClusterSamples had already been called.
    if (Clusterer == NULL) {  // To avoid a SIGSEGV
      fprintf(stderr, "Error: NULL clusterer!\n");
      return 1;
    }
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    // Retry with a progressively smaller MinSamples until at least one
    // significant prototype is produced or the threshold gets too small.
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    // Undo the temporary reduction so the next character starts fresh.
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
  }
  FreeTrainingSamples(CharList);
  // Still NULL here means the sample list was empty and the loop never ran.
  if (Clusterer == NULL) // To avoid a SIGSEGV
    return 1;
  WriteNormProtos (Directory, NormProtoList, Clusterer);
  FreeClusterer(Clusterer);
  FreeProtoList(&ProtoList);
  FreeNormProtoList(NormProtoList);
  printf ("\n");
  return 0;
}  // main
/**
 * Creates a MasterTrainer and loads the training data into it:
 * Initializes feature_defs and IntegerFX.
 * Loads the shape_table if shape_table != nullptr.
 * Loads initial unicharset from -U command-line option.
 * If FLAGS_T is set, loads the majority of data from there, else:
 *  - Loads font info from -F option.
 *  - Loads xheights from -X option.
 *  - Loads samples from .tr files in remaining command-line args.
 *  - Deletes outliers and computes canonical samples.
 *  - If FLAGS_output_trainer is set, saves the trainer for future use.
 * Computes canonical and cloud features.
 * If shape_table is not nullptr, but failed to load, make a fake flat one,
 * as shape clustering was not run.
 */
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  // The output prefix is the -D directory (if given) followed by a slash.
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != nullptr) shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  // Every failure path below deletes the trainer and returns nullptr; on
  // success the caller receives ownership of the returned pointer.
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  if (FLAGS_T.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
      tprintf("Reading %s ...\n", page_name);
      trainer->ReadTrainingSamples(page_name, feature_defs, false);

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      // The name is derived by replacing the trailing "tr" with "fontinfo".
      // STRING is used instead of a hand-sized char buffer so there is no
      // length arithmetic to get wrong, and names shorter than two
      // characters are left untruncated rather than indexed out of range.
      STRING fontinfo_file_name = page_name;
      if (fontinfo_file_name.length() >= 2) {
        fontinfo_file_name.truncate_at(fontinfo_file_name.length() - 2);
      }
      fontinfo_file_name += "fontinfo";
      trainer->AddSpacingInfo(fontinfo_file_name.string());

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        if (image_name.length() >= 2) {
          image_name.truncate_at(image_name.length() - 2);
        }
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == nullptr) {
        // Failing to save is reported but does not abort the load.
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    // -T was given: restore a previously serialized trainer instead of
    // reading the individual input files.
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_T.c_str());
    FILE* fp = fopen(FLAGS_T.c_str(), "rb");
    if (fp == nullptr) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_T.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return nullptr;
    }
    trainer->SetFeatureSpace(fs);
  }
  trainer->PreTrainingSetup();
  // Optionally save the resulting unicharset; a failed save is fatal.
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return nullptr;
  }
  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}
Exemple #6
0
/**
* This program reads in a text file consisting of feature
* samples from a training page in the following format:
* @verbatim
   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...
@endverbatim
* It then appends these samples into a separate file for each
* character.  The name of the file is
*
*   DirectoryName/FontName/CharName.FeatureTypeName
*
* The DirectoryName can be specified via a command
* line argument.  If not specified, it defaults to the
* current directory.  The format of the resulting files is:
* @verbatim
   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...
@endverbatim
* The output files each have a header which describes the
* type of feature which the file contains.  This header is
* in the format required by the clusterer.  A command line
* argument can also be used to specify that only the first
* N samples of each class should be used.
* @param argc  number of command line arguments
* @param argv  array of command line arguments
* @return 0 on success, 1 if a clusterer could not be set up
* @note Globals: none
* @note Exceptions: none
* @note History: Fri Aug 18 08:56:17 1989, DSJ, Created.
*/
int main(int argc, char *argv[]) {
  // Set the global Config parameters before parsing the command line.
  Config = CNConfig;

  const char  *PageName;
  FILE  *TrainingPage;
  LIST  CharList = NIL_LIST;
  CLUSTERER  *Clusterer = NULL;
  LIST    ProtoList = NIL_LIST;
  LIST    NormProtoList = NIL_LIST;
  LIST pCharList;
  LABELEDLIST CharSample;
  FEATURE_DEFS_STRUCT FeatureDefs;
  InitFeatureDefs(&FeatureDefs);

  ParseArguments(&argc, &argv);
  int num_fonts = 0;
  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
    printf("Reading %s ...\n", PageName);
    TrainingPage = Efopen(PageName, "rb");
    ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE,
                        100, NULL, TrainingPage, &CharList);
    fclose(TrainingPage);
    ++num_fonts;
  }
  printf("Clustering ...\n");
  // To allow an individual font to form a separate cluster,
  // reduce the min samples:
  // Config.MinSamples = 0.5 / num_fonts;
  pCharList = CharList;
  // The norm protos will count the source protos, so we keep them here in
  // freeable_protos, so they can be freed later.
  GenericVector<LIST> freeable_protos;
  iterate(pCharList) {
    //Cluster
    CharSample = (LABELEDLIST)first_node(pCharList);
    Clusterer =
      SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
    if (Clusterer == NULL) {  // To avoid a SIGSEGV
      fprintf(stderr, "Error: NULL clusterer!\n");
      return 1;
    }
    float SavedMinSamples = Config.MinSamples;
    // To disable the tendency to produce a single cluster for all fonts,
    // make MagicSamples an impossible to achieve number:
    // Config.MagicSamples = CharSample->SampleCount * 10;
    Config.MagicSamples = CharSample->SampleCount;
    while (Config.MinSamples > 0.001) {
      ProtoList = ClusterSamples(Clusterer, &Config);
      if (NumberOfProtos(ProtoList, 1, 0) > 0) {
        break;
      } else {
        Config.MinSamples *= 0.95;
        printf("0 significant protos for %s."
               " Retrying clustering with MinSamples = %f%%\n",
               CharSample->Label, Config.MinSamples);
      }
    }
    Config.MinSamples = SavedMinSamples;
    AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
    freeable_protos.push_back(ProtoList);
    FreeClusterer(Clusterer);
  }
  FreeTrainingSamples(CharList);
  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
                  FeatureDefs.FeatureDesc[desc_index]);
  FreeNormProtoList(NormProtoList);
  for (int i = 0; i < freeable_protos.size(); ++i) {
    FreeProtoList(&freeable_protos[i]);
  }
  printf ("\n");
  return 0;
}  // main