// Copies the given feature_space and uses it as the index feature map // from INT_FEATURE_STRUCT. void IntFeatureMap::Init(const IntFeatureSpace& feature_space) { feature_space_ = feature_space; mapping_changed_ = false; int sparse_size = feature_space_.Size(); feature_map_.Init(sparse_size, true); feature_map_.Setup(); compact_size_ = feature_map_.CompactSize(); // Initialize look-up tables if needed. FCOORD dir = FeatureDirection(0); if (dir.x() == 0.0f && dir.y() == 0.0f) InitIntegerFX(); // Compute look-up tables to generate offset features. for (int dir = 0; dir < kNumOffsetMaps; ++dir) { delete [] offset_plus_[dir]; delete [] offset_minus_[dir]; offset_plus_[dir] = new int[sparse_size]; offset_minus_[dir] = new int[sparse_size]; } for (int dir = 1; dir <= kNumOffsetMaps; ++dir) { for (int i = 0; i < sparse_size; ++i) { int offset_index = ComputeOffsetFeature(i, dir); offset_plus_[dir - 1][i] = offset_index; offset_index = ComputeOffsetFeature(i, -dir); offset_minus_[dir - 1][i] = offset_index; } } }
/**
 * Creates a MasterTrainer and loads the training data into it:
 * Initializes feature_defs and IntegerFX.
 * Loads the shape_table if shape_table != nullptr.
 * If FLAGS_T is set, loads the trainer from that file, else:
 *  - Loads initial unicharset from -U command-line option.
 *  - Loads font info from -F option.
 *  - Loads xheights from -X option.
 *  - Loads samples from .tr files in remaining command-line args, plus the
 *    matching .fontinfo file and (if FLAGS_load_images) the matching .tif.
 *  - Calls PostLoadCleanup() (deletes outliers and computes canonical
 *    samples).
 *  - If FLAGS_output_trainer is set, saves the trainer for future use.
 * Calls PreTrainingSetup() (computes canonical and cloud features).
 * If FLAGS_O is set, saves the unicharset to that file.
 * If shape_table is not nullptr, but failed to load, make a fake flat one,
 * as shape clustering was not run.
 *
 * @param argc         Command-line argument count, passed to GetNextFilename.
 * @param argv         Command-line arguments, passed to GetNextFilename.
 * @param replication  Forwarded to the MasterTrainer constructor.
 * @param shape_table  If non-null, receives the loaded (or flat fallback)
 *                     shape table. It may already have been set when this
 *                     function returns nullptr.
 * @param file_prefix  Must be non-null. Set to "" or to FLAGS_D followed
 *                     by "/".
 * @return The new trainer (allocated with new), or nullptr if loading font
 *         info, xheights or the saved trainer fails, or if the unicharset
 *         cannot be saved.
 */
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != nullptr) shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  if (FLAGS_T.empty()) {
    trainer->LoadUnicharset(FLAGS_U.c_str());
    // Get basic font information from font_properties.
    if (!FLAGS_F.empty()) {
      if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    if (!FLAGS_X.empty()) {
      if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
        delete trainer;
        return nullptr;
      }
    }
    trainer->SetFeatureSpace(fs);
    const char* page_name;
    // Load training data from .tr files on the command line.
    while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
      tprintf("Reading %s ...\n", page_name);
      trainer->ReadTrainingSamples(page_name, feature_defs, false);

      // The companion files are named by replacing the trailing "tr" of the
      // page name. A name shorter than two characters has no such suffix to
      // strip, and subtracting 2 from its length would go negative, so skip
      // the companion files for it.
      const size_t pagename_len = strlen(page_name);
      if (pagename_len < 2) {
        tprintf("File name %s is too short to derive companion file names!\n",
                page_name);
        continue;
      }
      const int stem_len = static_cast<int>(pagename_len) - 2;

      // If there is a file with [lang].[fontname].exp[num].fontinfo present,
      // read font spacing information in to fontinfo_table.
      STRING fontinfo_file_name = page_name;
      fontinfo_file_name.truncate_at(stem_len);  // remove "tr"
      fontinfo_file_name += "fontinfo";          // +"fontinfo"
      trainer->AddSpacingInfo(fontinfo_file_name.string());

      // Load the images into memory if required by the classifier.
      if (FLAGS_load_images) {
        STRING image_name = page_name;
        // Chop off the tr and replace with tif. Extension must be tif!
        image_name.truncate_at(stem_len);
        image_name += "tif";
        trainer->LoadPageImages(image_name.string());
      }
    }
    trainer->PostLoadCleanup();
    // Write the master trainer if required.
    if (!FLAGS_output_trainer.empty()) {
      FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
      if (fp == nullptr) {
        // Failing to save is reported but does not abort training.
        tprintf("Can't create saved trainer data!\n");
      } else {
        trainer->Serialize(fp);
        fclose(fp);
      }
    }
  } else {
    bool success = false;
    tprintf("Loading master trainer from file:%s\n",
            FLAGS_T.c_str());
    FILE* fp = fopen(FLAGS_T.c_str(), "rb");
    if (fp == nullptr) {
      tprintf("Can't read file %s to initialize master trainer\n",
              FLAGS_T.c_str());
    } else {
      success = trainer->DeSerialize(false, fp);
      fclose(fp);
    }
    if (!success) {
      tprintf("Deserialize of master trainer failed!\n");
      delete trainer;
      return nullptr;
    }
    trainer->SetFeatureSpace(fs);
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return nullptr;
  }
  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}