예제 #1
0
/*---------------------------------------------------------------------------*/
int main (int argc, char **argv) {
/*
**  Parameters:
**    argc  number of command line arguments
**    argv  array of command line arguments
**  Globals: none
**  Operation:
**    This program reads in a text file consisting of feature
**    samples from a training page in the following format:
**
**      FontName UTF8-char-str xmin ymin xmax ymax page-number
**       NumberOfFeatureTypes(N)
**         FeatureTypeName1 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         FeatureTypeName2 NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**         ...
**         FeatureTypeNameN NumberOfFeatures(M)
**            Feature1
**            ...
**            FeatureM
**      FontName CharName ...
**
**    The result of this program is a binary inttemp file used by
**    the OCR engine.
**  Return: none
**  Exceptions: none
**  History:  Fri Aug 18 08:56:17 1989, DSJ, Created.
**        Mon May 18 1998, Christy Russson, Revistion started.
*/
  ParseArguments(&argc, &argv);

  ShapeTable* shape_table = NULL;
  STRING file_prefix;
  // Load the training data.
  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
                                                       false,
                                                       &shape_table,
                                                       &file_prefix);
  if (trainer == NULL)
    return 1;  // Failed.

  // Setup an index mapping from the shapes in the shape table to the classes
  // that will be trained. In keeping with the original design, each shape
  // with the same list of unichars becomes a different class and the configs
  // represent the different combinations of fonts.
  IndexMapBiDi config_map;
  SetupConfigMap(shape_table, &config_map);

  WriteShapeTable(file_prefix, *shape_table);
  // If the shape_table is flat, then either we didn't run shape clustering, or
  // it did nothing, so we just output the trainer's unicharset.
  // Otherwise shape_set will hold a fake unicharset with an entry for each
  // shape in the shape table, and we will output that instead.
  UNICHARSET shape_set;
  const UNICHARSET* unicharset = &trainer->unicharset();
  // If we ran shapeclustering (and it worked) then at least one shape will
  // have multiple unichars, so we have to build a fake unicharset.
  if (shape_table->AnyMultipleUnichars()) {
    unicharset = &shape_set;
    // Now build a fake unicharset for the compact shape space to keep the
    // output modules happy that we are doing things correctly.
    int num_shapes = config_map.CompactSize();
    for (int s = 0; s < num_shapes; ++s) {
      char shape_label[kMaxShapeLabelLength + 1];
      snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
      shape_set.unichar_insert(shape_label);
    }
  }

  // Now train each config separately.
  int num_configs = shape_table->NumShapes();
  LIST mf_classes = NIL_LIST;
  for (int s = 0; s < num_configs; ++s) {
    int unichar_id, font_id;
    if (unicharset == &shape_set) {
      // Using fake unichar_ids from the config_map/shape_set.
      unichar_id = config_map.SparseToCompact(s);
    } else {
      // Get the real unichar_id from the shape table/unicharset.
      shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
    }
    const char* class_label = unicharset->id_to_unichar(unichar_id);
    mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
                                  trainer);
  }
  STRING inttemp_file = file_prefix;
  inttemp_file += "inttemp";
  STRING pffmtable_file = file_prefix;
  pffmtable_file += "pffmtable";
  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
  // Now write the inttemp and pffmtable.
  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
                                    *shape_table, float_classes,
                                    inttemp_file.string(),
                                    pffmtable_file.string());
  delete [] float_classes;
  FreeLabeledClassList(mf_classes);
  delete trainer;
  delete shape_table;
  printf("Done!\n");
  if (!FLAGS_test_ch.empty()) {
    // If we are displaying debug window(s), wait for the user to look at them.
    printf("Hit return to exit...\n");
    while (getchar() != '\n');
  }
  return 0;
}  /* main */
예제 #2
0
/**
 * This program reads in a text file consisting of feature
 * samples from a training page in the following format:
 * @verbatim
      FontName UTF8-char-str xmin ymin xmax ymax page-number
       NumberOfFeatureTypes(N)
         FeatureTypeName1 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         FeatureTypeName2 NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
         ...
         FeatureTypeNameN NumberOfFeatures(M)
            Feature1
            ...
            FeatureM
      FontName CharName ...
    @endverbatim
 * The result of this program is a binary inttemp file used by
 * the OCR engine.
 * @param  argc  number of command line arguments
 * @param  argv  array of command line arguments
 * @return none
 * @note Exceptions: none
 * @note History:  Fri Aug 18 08:56:17 1989, DSJ, Created.
 * @note History: Mon May 18 1998, Christy Russson, Revistion started.
 */
int main (int argc, char **argv) {
  ParseArguments(&argc, &argv);

  ShapeTable* shape_table = NULL;
  STRING file_prefix;
  // Load the training data.
  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
                                                       false,
                                                       &shape_table,
                                                       &file_prefix);
  if (trainer == NULL)
    return 1;  // Failed.

  // Setup an index mapping from the shapes in the shape table to the classes
  // that will be trained. In keeping with the original design, each shape
  // with the same list of unichars becomes a different class and the configs
  // represent the different combinations of fonts.
  IndexMapBiDi config_map;
  SetupConfigMap(shape_table, &config_map);

  WriteShapeTable(file_prefix, *shape_table);
  // If the shape_table is flat, then either we didn't run shape clustering, or
  // it did nothing, so we just output the trainer's unicharset.
  // Otherwise shape_set will hold a fake unicharset with an entry for each
  // shape in the shape table, and we will output that instead.
  UNICHARSET shape_set;
  const UNICHARSET* unicharset = &trainer->unicharset();
  // If we ran shapeclustering (and it worked) then at least one shape will
  // have multiple unichars, so we have to build a fake unicharset.
  if (shape_table->AnyMultipleUnichars()) {
    unicharset = &shape_set;
    // Now build a fake unicharset for the compact shape space to keep the
    // output modules happy that we are doing things correctly.
    int num_shapes = config_map.CompactSize();
    for (int s = 0; s < num_shapes; ++s) {
      char shape_label[kMaxShapeLabelLength + 1];
      snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
      shape_set.unichar_insert(shape_label);
    }
  }

  // Now train each config separately.
  int num_configs = shape_table->NumShapes();
  LIST mf_classes = NIL_LIST;
  for (int s = 0; s < num_configs; ++s) {
    int unichar_id, font_id;
    if (unicharset == &shape_set) {
      // Using fake unichar_ids from the config_map/shape_set.
      unichar_id = config_map.SparseToCompact(s);
    } else {
      // Get the real unichar_id from the shape table/unicharset.
      shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
    }
    const char* class_label = unicharset->id_to_unichar(unichar_id);
    mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
                                  trainer);
  }
  STRING inttemp_file = file_prefix;
  inttemp_file += "inttemp";
  STRING pffmtable_file = file_prefix;
  pffmtable_file += "pffmtable";
  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
  // Now write the inttemp and pffmtable.
  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
                                    *shape_table, float_classes,
                                    inttemp_file.string(),
                                    pffmtable_file.string());
  for (int c = 0; c < unicharset->size(); ++c) {
    FreeClassFields(&float_classes[c]);
  }
  delete [] float_classes;
  FreeLabeledClassList(mf_classes);
  delete trainer;
  delete shape_table;
  printf("Done!\n");
  if (!FLAGS_test_ch.empty()) {
    // If we are displaying debug window(s), wait for the user to look at them.
    printf("Hit return to exit...\n");
    while (getchar() != '\n');
  }
  return 0;
}  /* main */
예제 #3
0
/**
 * Creates a MasterTraininer and loads the training data into it:
 * Initializes feature_defs and IntegerFX.
 * Loads the shape_table if shape_table != nullptr.
 * Loads initial unicharset from -U command-line option.
 * If FLAGS_T is set, loads the majority of data from there, else:
 *  - Loads font info from -F option.
 *  - Loads xheights from -X option.
 *  - Loads samples from .tr files in remaining command-line args.
 *  - Deletes outliers and computes canonical samples.
 *  - If FLAGS_output_trainer is set, saves the trainer for future use.
 * Computes canonical and cloud features.
 * If shape_table is not nullptr, but failed to load, make a fake flat one,
 * as shape clustering was not run.
 */
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != nullptr) shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  if (FLAGS_T.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
      tprintf("Reading %s ...\n", page_name);
      trainer->ReadTrainingSamples(page_name, feature_defs, false);

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      int pagename_len = strlen(page_name);
      char *fontinfo_file_name = new char[pagename_len + 7];
      strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
      strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
      trainer->AddSpacingInfo(fontinfo_file_name);
      delete[] fontinfo_file_name;

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(image_name.length() - 2);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == nullptr) {
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_T.c_str());
    FILE* fp = fopen(FLAGS_T.c_str(), "rb");
    if (fp == nullptr) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_T.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return nullptr;
    }
    trainer->SetFeatureSpace(fs);
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return nullptr;
  }
  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}