Exemple #1
0
// Returns false on failure, e.g. if TessdataManager could not be initialized
// for the language's traineddata file, if the unicharset for the specified
// language was not found, was invalid or is larger than MAX_NUM_CLASSES, or
// if the params model stored in the traineddata file could not be loaded.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
//
// This function sets tessedit_ocr_engine_mode to the given OcrEngineMode oem,
// unless it is OEM_DEFAULT, in which case the value of the variable will be
// obtained from the language-specific config file (stored in
// [lang].traineddata), from the config files specified on the command line or
// left as the default OEM_TESSERACT_ONLY if none of the configs specify this
// variable.
bool Tesseract::init_tesseract_lang_data(
    const char *arg0, const char *textbase, const char *language,
    OcrEngineMode oem, char **configs, int configs_size,
    const GenericVector<STRING> *vars_vec,
    const GenericVector<STRING> *vars_values,
    bool set_only_non_debug_params) {
  // Set the basename, compute the data directory.
  main_setup(arg0, textbase);

  // Set the language data path prefix
  lang = language != NULL ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!tessdata_manager.Init(tessdata_path.string(),
                             tessdata_manager_debug_level)) {
    return false;
  }

  // If a language specific config file (lang.config) exists, load it in.
  if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
    ParamUtils::ReadParamsFromFp(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
        SET_PARAM_CONSTRAINT_NONE, this->params());
    if (tessdata_manager_debug_level) {
      tprintf("Loaded language config file\n");
    }
  }

  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
      SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_params_constraint);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != NULL && vars_values != NULL) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].string(),
                                (*vars_values)[i].string(),
                                set_params_constraint, this->params())) {
        tprintf("Error setting param %s\n", (*vars_vec)[i].string());
        exit(1);
      }
    }
  }

  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
    FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
    if (params_file != NULL) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
      if (tessdata_manager_debug_level > 0) {
        tprintf("Wrote parameters to %s\n",
                tessedit_write_params_to_file.string());
      }
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.string());
    }
  }

  // Determine which ocr engine(s) should be loaded and used for recognition.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
  if (tessdata_manager_debug_level) {
    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
            static_cast<int>(tessedit_ocr_engine_mode));
  }

  // If we are only loading the config file (and so not planning on doing any
  // recognition) then there's nothing else do here.
  if (tessedit_init_config_only) {
    if (tessdata_manager_debug_level) {
      tprintf("Returning after loading config file\n");
    }
    return true;
  }

  // Load the unicharset
  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
    return false;
  }
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
  right_to_left_ = unicharset.major_right_to_left();

  // Setup initial unichar ambigs table and read universal ambigs.
  UNICHARSET encoder_unicharset;
  encoder_unicharset.CopyFrom(unicharset);
  unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);

  if (!tessedit_ambigs_training &&
      tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
    TFile ambigs_file;
    ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
                     tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
    unichar_ambigs.LoadUnicharAmbigs(
        encoder_unicharset,
        &ambigs_file,
        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }

  // Load Cube objects if necessary.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube w/out combiner\n");
  } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube with combiner\n");
  }

  // Init ParamsModel.
  // Load pass1 and pass2 weights (for now these two sets are the same, but in
  // the future separate sets of weights can be generated).
  for (int p = ParamsModel::PTRAIN_PASS1;
      p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
    language_model_->getParamsModel().SetPass(
        static_cast<ParamsModel::PassEnum>(p));
    if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
      if (!language_model_->getParamsModel().LoadFromFp(
          lang.string(), tessdata_manager.GetDataFilePtr(),
          tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
        return false;
      }
    }
  }
  if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();

  return true;
}
// Returns false on failure, e.g. if the data directory could not be
// determined (in _BUILDASDLL builds), if TessdataManager could not be
// initialized for the language's traineddata file, or if the unicharset for
// the specified language was not found, was invalid or is larger than
// MAX_NUM_CLASSES.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
//
// This function sets tessedit_ocr_engine_mode to the given OcrEngineMode oem,
// unless it is OEM_DEFAULT, in which case the value of the variable will be
// obtained from the language-specific config file (stored in
// [lang].traineddata), from the config files specified on the command line or
// left as the default OEM_TESSERACT_ONLY if none of the configs specify this
// variable.
bool Tesseract::init_tesseract_lang_data(
    const char *arg0, const char *textbase, const char *language,
    OcrEngineMode oem, char **configs, int configs_size,
    const GenericVector<STRING> *vars_vec,
    const GenericVector<STRING> *vars_values,
    bool set_only_init_params) {
  // Set the basename, compute the data directory.
	 #if _BUILDASDLL
		imagebasename = textbase;      /*name of image */
		STRING dll_module_name;
	#ifdef __MSW32__
		dll_module_name = tessedit_module_name;
	#endif
		if (getpath(arg0, dll_module_name, datadir) < 0)
			return false;
	#else
		main_setup(arg0, textbase);
	#endif

  // Set the language data path prefix
  lang = language != NULL ? language : "eng";
  language_data_path_prefix = datadir;
  language_data_path_prefix += lang;
  language_data_path_prefix += ".";

  // Initialize TessdataManager.
  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
  if (!tessdata_manager.Init(tessdata_path.string(),
                             tessdata_manager_debug_level)) {
    return false;
  }

  // If a language specific config file (lang.config) exists, load it in.
  if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
    ParamUtils::ReadParamsFromFp(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
        false, this->params());
    if (tessdata_manager_debug_level) {
      tprintf("Loaded language config file\n");
    }
  }

  // Load tesseract variables from config files. This is done after loading
  // language-specific variables from [lang].traineddata file, so that custom
  // config files can override values in [lang].traineddata file.
  for (int i = 0; i < configs_size; ++i) {
    read_config_file(configs[i], set_only_init_params);
  }

  // Set params specified in vars_vec (done after setting params from config
  // files, so that params in vars_vec can override those from files).
  if (vars_vec != NULL && vars_values != NULL) {
    for (int i = 0; i < vars_vec->size(); ++i) {
      if (!ParamUtils::SetParam((*vars_vec)[i].string(),
                                (*vars_values)[i].string(),
                                set_only_init_params, this->params())) {
        tprintf("Error setting param %s\n", (*vars_vec)[i].string());
        exit(1);
      }
    }
  }

  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
    FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
    if (params_file != NULL) {
      ParamUtils::PrintParams(params_file, this->params());
      fclose(params_file);
      if (tessdata_manager_debug_level > 0) {
        tprintf("Wrote parameters to %s\n",
                tessedit_write_params_to_file.string());
      }
    } else {
      tprintf("Failed to open %s for writing params.\n",
              tessedit_write_params_to_file.string());
    }
  }

  // Determine which ocr engine(s) should be loaded and used for recognition.
  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
  if (tessdata_manager_debug_level) {
    tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
            static_cast<int>(tessedit_ocr_engine_mode));
  }

  // Load the unicharset
  if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
      !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
    return false;
  }
  if (unicharset.size() > MAX_NUM_CLASSES) {
    tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
    return false;
  }
  right_to_left_ = unicharset.any_right_to_left();
  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");

  if (!tessedit_ambigs_training &&
      tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
    unichar_ambigs.LoadUnicharAmbigs(
        tessdata_manager.GetDataFilePtr(),
        tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
        ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
    if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
  }

  // Load Cube objects if necessary.
  if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
    ASSERT_HOST(init_cube_objects(false, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube w/out combiner\n");
  } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
    ASSERT_HOST(init_cube_objects(true, &tessdata_manager));
    if (tessdata_manager_debug_level)
      tprintf("Loaded Cube with combiner\n");
  }

  return true;
}