// Returns false if a unicharset file for the specified language was not found // or was invalid. // This function initializes TessdataManager. After TessdataManager is // no longer needed, TessdataManager::End() should be called. // // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless // it is OEM_DEFAULT, in which case the value of the variable will be obtained // from the language-specific config file (stored in [lang].traineddata), from // the config files specified on the command line or left as the default // OEM_TESSERACT_ONLY if none of the configs specify this variable. bool Tesseract::init_tesseract_lang_data( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector<STRING> *vars_vec, const GenericVector<STRING> *vars_values, bool set_only_non_debug_params) { // Set the basename, compute the data directory. main_setup(arg0, textbase); // Set the language data path prefix lang = language != NULL ? language : "eng"; language_data_path_prefix = datadir; language_data_path_prefix += lang; language_data_path_prefix += "."; // Initialize TessdataManager. STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; if (!tessdata_manager.Init(tessdata_path.string(), tessdata_manager_debug_level)) { return false; } // If a language specific config file (lang.config) exists, load it in. if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { ParamUtils::ReadParamsFromFp( tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), SET_PARAM_CONSTRAINT_NONE, this->params()); if (tessdata_manager_debug_level) { tprintf("Loaded language config file\n"); } } SetParamConstraint set_params_constraint = set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; // Load tesseract variables from config files. This is done after loading // language-specific variables from [lang].traineddata file, so that custom // config files can override values in [lang].traineddata file. for (int i = 0; i < configs_size; ++i) { read_config_file(configs[i], set_params_constraint); } // Set params specified in vars_vec (done after setting params from config // files, so that params in vars_vec can override those from files). if (vars_vec != NULL && vars_values != NULL) { for (int i = 0; i < vars_vec->size(); ++i) { if (!ParamUtils::SetParam((*vars_vec)[i].string(), (*vars_values)[i].string(), set_params_constraint, this->params())) { tprintf("Error setting param %s\n", (*vars_vec)[i].string()); exit(1); } } } if (((STRING &)tessedit_write_params_to_file).length() > 0) { FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); if (params_file != NULL) { ParamUtils::PrintParams(params_file, this->params()); fclose(params_file); if (tessdata_manager_debug_level > 0) { tprintf("Wrote parameters to %s\n", tessedit_write_params_to_file.string()); } } else { tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.string()); } } // Determine which ocr engine(s) should be loaded and used for recognition. if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); if (tessdata_manager_debug_level) { tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", static_cast<int>(tessedit_ocr_engine_mode)); } // If we are only loading the config file (and so not planning on doing any // recognition) then there's nothing else do here. if (tessedit_init_config_only) { if (tessdata_manager_debug_level) { tprintf("Returning after loading config file\n"); } return true; } // Load the unicharset if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { return false; } if (unicharset.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); right_to_left_ = unicharset.major_right_to_left(); // Setup initial unichar ambigs table and read universal ambigs. UNICHARSET encoder_unicharset; encoder_unicharset.CopyFrom(unicharset); unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); if (!tessedit_ambigs_training && tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { TFile ambigs_file; ambigs_file.Open(tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1); unichar_ambigs.LoadUnicharAmbigs( encoder_unicharset, &ambigs_file, ambigs_debug_level, use_ambigs_for_adaption, &unicharset); if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); } // Load Cube objects if necessary. if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube w/out combiner\n"); } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube with combiner\n"); } // Init ParamsModel. // Load pass1 and pass2 weights (for now these two sets are the same, but in // the future separate sets of weights can be generated). for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { language_model_->getParamsModel().SetPass( static_cast<ParamsModel::PassEnum>(p)); if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) { if (!language_model_->getParamsModel().LoadFromFp( lang.string(), tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) { return false; } } } if (tessdata_manager_debug_level) language_model_->getParamsModel().Print(); return true; }
// Returns false if a unicharset file for the specified language was not found // or was invalid. // This function initializes TessdataManager. After TessdataManager is // no longer needed, TessdataManager::End() should be called. // // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless // it is OEM_DEFAULT, in which case the value of the variable will be obtained // from the language-specific config file (stored in [lang].traineddata), from // the config files specified on the command line or left as the default // OEM_TESSERACT_ONLY if none of the configs specify this variable. bool Tesseract::init_tesseract_lang_data( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector<STRING> *vars_vec, const GenericVector<STRING> *vars_values, bool set_only_init_params) { // Set the basename, compute the data directory. #if _BUILDASDLL imagebasename = textbase; /*name of image */ STRING dll_module_name; #ifdef __MSW32__ dll_module_name = tessedit_module_name; #endif if (getpath(arg0, dll_module_name, datadir) < 0) return false; #else main_setup(arg0, textbase); #endif // Set the language data path prefix lang = language != NULL ? language : "eng"; language_data_path_prefix = datadir; language_data_path_prefix += lang; language_data_path_prefix += "."; // Initialize TessdataManager. STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; if (!tessdata_manager.Init(tessdata_path.string(), tessdata_manager_debug_level)) { return false; } // If a language specific config file (lang.config) exists, load it in. if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { ParamUtils::ReadParamsFromFp( tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), false, this->params()); if (tessdata_manager_debug_level) { tprintf("Loaded language config file\n"); } } // Load tesseract variables from config files. This is done after loading // language-specific variables from [lang].traineddata file, so that custom // config files can override values in [lang].traineddata file. for (int i = 0; i < configs_size; ++i) { read_config_file(configs[i], set_only_init_params); } // Set params specified in vars_vec (done after setting params from config // files, so that params in vars_vec can override those from files). if (vars_vec != NULL && vars_values != NULL) { for (int i = 0; i < vars_vec->size(); ++i) { if (!ParamUtils::SetParam((*vars_vec)[i].string(), (*vars_values)[i].string(), set_only_init_params, this->params())) { tprintf("Error setting param %s\n", (*vars_vec)[i].string()); exit(1); } } } if (((STRING &)tessedit_write_params_to_file).length() > 0) { FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); if (params_file != NULL) { ParamUtils::PrintParams(params_file, this->params()); fclose(params_file); if (tessdata_manager_debug_level > 0) { tprintf("Wrote parameters to %s\n", tessedit_write_params_to_file.string()); } } else { tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.string()); } } // Determine which ocr engine(s) should be loaded and used for recognition. if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); if (tessdata_manager_debug_level) { tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", static_cast<int>(tessedit_ocr_engine_mode)); } // Load the unicharset if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { return false; } if (unicharset.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } right_to_left_ = unicharset.any_right_to_left(); if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); if (!tessedit_ambigs_training && tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { unichar_ambigs.LoadUnicharAmbigs( tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_AMBIGS), ambigs_debug_level, use_ambigs_for_adaption, &unicharset); if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); } // Load Cube objects if necessary. if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube w/out combiner\n"); } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube with combiner\n"); } return true; }