// Helper to set the properties for an input unicharset file, writes to the
// output file.
//   script_dir:             directory passed on to SetScriptProperties and
//                           GetXheightString to locate per-script data.
//   input_unicharset_file:  unicharset to load.
//   output_unicharset_file: destination for the unicharset after its basic
//                           and script properties have been set.
//   output_xheights_file:   if non-empty, the xheight string obtained from
//                           GetXheightString is written to this file.
// If the input unicharset cannot be loaded, the failure is reported and
// nothing is written.
void SetPropertiesForInputFile(const std::string& script_dir,
                               const std::string& input_unicharset_file,
                               const std::string& output_unicharset_file,
                               const std::string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset. Stop on failure instead of deriving properties
  // for, and then saving, a unicharset that was never read.
  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
    tprintf("Failed to load unicharset from %s\n",
            input_unicharset_file.c_str());
    return;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, false, &unicharset);
  tprintf("Setting script properties\n");
  SetScriptProperties(script_dir, &unicharset);

  // The xheights output is optional: only produce it when a file was named.
  if (!output_xheights_file.empty()) {
    std::string xheights_str = GetXheightString(script_dir, unicharset);
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  }

  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  unicharset.save_to_file(output_unicharset_file.c_str());
}
// Helper to set the properties for an input unicharset file, writes to the
// output file.
// For every script in the unicharset's script table:
//   - if script_dir/<script>.unicharset loads, its properties are merged into
//     the unicharset via SetPropertiesFromOther;
//   - if script_dir/<script>.xheights can be read, its contents are appended
//     to the accumulated xheights string.
// If output_xheights_file is non-empty, the accumulated xheights string is
// written to it. A warning is printed for every non-special unichar whose
// properties are still incomplete.
// If the input unicharset cannot be loaded, the failure is reported and
// nothing is written.
void SetPropertiesForInputFile(const string& script_dir,
                               const string& input_unicharset_file,
                               const string& output_unicharset_file,
                               const string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset. Stop on failure instead of deriving properties
  // for, and then saving, a unicharset that was never read.
  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
    tprintf("Failed to load unicharset from %s\n",
            input_unicharset_file.c_str());
    return;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, false, &unicharset);
  string xheights_str;
  for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
    // Load the unicharset for the script if available.
    string filename = script_dir + "/" +
        unicharset.get_script_from_script_id(s) + ".unicharset";
    UNICHARSET script_set;
    if (script_set.load_from_file(filename.c_str())) {
      unicharset.SetPropertiesFromOther(script_set);
    }
    // Load the xheights for the script if available.
    filename = script_dir + "/" + unicharset.get_script_from_script_id(s) +
        ".xheights";
    string script_heights;
    if (File::ReadFileToString(filename, &script_heights))
      xheights_str += script_heights;
  }
  // The xheights output is optional: only write it when a file was named.
  if (!output_xheights_file.empty())
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  // Report unichars (past the special codes) that still lack properties.
  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) {
    if (unicharset.PropertiesIncomplete(c)) {
      tprintf("Warning: properties incomplete for index %d = %s\n",
              c, unicharset.id_to_unichar(c));
    }
  }

  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  unicharset.save_to_file(output_unicharset_file.c_str());
}
// Program entry point.
// Sets properties on the input unicharset file, and writes:
//   rootdir/lang/lang.charset_size=ddd.txt
//   rootdir/lang/lang.traineddata
//   rootdir/lang/lang.unicharset
// If the 3 word lists are provided, the dawgs are also added
// to the traineddata file.
// The output unicharset and charset_size files are just for
// human readability.
// Returns 1 if the input unicharset cannot be loaded, otherwise the value
// returned by CombineLangModel.
int main(int argc, char** argv) {
  tesseract::CheckSharedLibraryVersion();
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);

  // Word lists, one entry per line of the corresponding flag file.
  // If these reads fail, we get a warning message and an empty list of words.
  GenericVector<STRING> words;
  GenericVector<STRING> puncs;
  GenericVector<STRING> numbers;
  tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
  tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
  tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);

  // Load the input unicharset; nothing else can be done without it.
  UNICHARSET unicharset;
  const bool unicharset_loaded =
      unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false);
  if (!unicharset_loaded) {
    tprintf("Failed to load unicharset from %s\n",
            FLAGS_input_unicharset.c_str());
    return 1;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          FLAGS_input_unicharset.c_str());

  // Fill in the per-unichar properties, then the per-script ones.
  tprintf("Setting unichar properties\n");
  tesseract::SetupBasicProperties(/*report_errors*/ true,
                                  /*decompose (NFD)*/ false, &unicharset);
  tprintf("Setting script properties\n");
  tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);

  // Combine everything into a traineddata file; its result is the exit code.
  const int exit_code = tesseract::CombineLangModel(
      unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(),
      FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder,
      words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
      /*writer*/ nullptr);
  return exit_code;
}
// Example no. 4
// 0
int main(int argc, char *argv[]) {
  if (argc != 4) {
    tprintf("Print all the words in a given dawg.\n");
    tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n",
            argv[0]);
    return 1;
  }
  const char *unicharset_file = argv[1];
  const char *dawg_file = argv[2];
  const char *wordlist_file = argv[3];
  UNICHARSET unicharset;
  if (!unicharset.load_from_file(unicharset_file)) {
    tprintf("Error loading unicharset from %s.\n", unicharset_file);
    return 1;
  }
  tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file);
  if (dict == NULL) {
    tprintf("Error loading dictionary from %s.\n", dawg_file);
    return 1;
  }
  int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file);
  delete dict;
  return retval;
}
// Example no. 5
// 0
// Renders the text file named by --text into page images, driven entirely by
// command-line flags.
//   --list_available_fonts: only prints the available fonts and returns.
//   --only_extract_font_properties: only runs ExtractFontProperties on the
//     (possibly re-flowed) text and returns.
//   otherwise: renders pages and writes <outputbase>.tif plus
//     <outputbase>.box, or, with --find_fonts, either per-font
//     <outputbase>.<font>.tif images (--render_per_font) or
//     <outputbase>.fontlist.txt.
// Returns 0 (EXIT_SUCCESS) on completion. Invalid flag combinations, an
// unavailable font, or unreadable input terminate the process via exit(1).
int Main() {
  if (FLAGS_list_available_fonts) {
    const std::vector<std::string>& all_fonts = FontUtils::ListAvailableFonts();
    for (unsigned int i = 0; i < all_fonts.size(); ++i) {
      printf("%3u: %s\n", i, all_fonts[i].c_str());
      ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()),
                      "Font %s is unrecognized.\n", all_fonts[i].c_str());
    }
    return EXIT_SUCCESS;
  }

  // Check validity of input flags.
  if (FLAGS_text.empty()) {
    tprintf("'--text' option is missing!\n");
    exit(1);
  }
  if (FLAGS_outputbase.empty()) {
    tprintf("'--outputbase' option is missing!\n");
    exit(1);
  }
  // The unicharset file is only loaded when rendering ngrams (see below), so
  // it must be rejected when --render_ngrams is NOT set. The previous check
  // rejected the valid combination instead, which made the unicharset load
  // further down unreachable.
  if (!FLAGS_unicharset_file.empty() && !FLAGS_render_ngrams) {
    tprintf("Use '--unicharset_file' only if '--render_ngrams' is set.\n");
    exit(1);
  }

  if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(FLAGS_font.c_str())) {
    // Ask again, this time collecting a suggested font name for the message.
    std::string pango_name;
    if (!FontUtils::IsAvailableFont(FLAGS_font.c_str(), &pango_name)) {
      tprintf("Could not find font named %s.\n", FLAGS_font.c_str());
      if (!pango_name.empty()) {
        tprintf("Pango suggested font %s.\n", pango_name.c_str());
      }
      tprintf("Please correct --font arg.\n");
      exit(1);
    }
  }

  // Rendering ngrams implies word-level boxes.
  if (FLAGS_render_ngrams)
    FLAGS_output_word_boxes = true;

  // The renderer takes a font description of the form "<font name> <ptsize>".
  char font_desc_name[1024];
  snprintf(font_desc_name, 1024, "%s %d", FLAGS_font.c_str(),
           static_cast<int>(FLAGS_ptsize));
  StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize);
  render.set_add_ligatures(FLAGS_ligatures);
  render.set_leading(FLAGS_leading);
  render.set_resolution(FLAGS_resolution);
  render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize);
  render.set_h_margin(FLAGS_margin);
  render.set_v_margin(FLAGS_margin);
  render.set_output_word_boxes(FLAGS_output_word_boxes);
  render.set_box_padding(FLAGS_box_padding);
  render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words);
  render.set_underline_start_prob(FLAGS_underline_start_prob);
  render.set_underline_continuation_prob(FLAGS_underline_continuation_prob);

  // Set text rendering orientation and their forms.
  if (FLAGS_writing_mode == "horizontal") {
    // Render regular horizontal text (default).
    render.set_vertical_text(false);
    render.set_gravity_hint_strong(false);
    render.set_render_fullwidth_latin(false);
  } else if (FLAGS_writing_mode == "vertical") {
    // Render vertical text. Glyph orientation is selected by Pango.
    render.set_vertical_text(true);
    render.set_gravity_hint_strong(false);
    render.set_render_fullwidth_latin(false);
  } else if (FLAGS_writing_mode == "vertical-upright") {
    // Render vertical text. Glyph orientation is set to be upright.
    // Also Basic Latin characters are converted to their fullwidth forms
    // on rendering, since fullwidth Latin characters are well designed to fit
    // vertical text lines, while .box files store halfwidth Basic Latin
    // unichars.
    render.set_vertical_text(true);
    render.set_gravity_hint_strong(true);
    render.set_render_fullwidth_latin(true);
  } else {
    tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str());
    exit(1);
  }

  std::string src_utf8;
  // This c_str is NOT redundant!
  if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) {
    tprintf("Failed to read file: %s\n", FLAGS_text.c_str());
    exit(1);
  }

  // Remove the unicode mark if present.
  if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) {
    src_utf8.erase(0, 3);
  }
  // length() is a size_t; convert explicitly so the argument matches %d.
  tlog(1, "Render string of size %d\n", static_cast<int>(src_utf8.length()));

  if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) {
    // Try to preserve behavior of old text2image by expanding inter-word
    // spaces by a factor of 4.
    const std::string kSeparator = FLAGS_render_ngrams ? "    " : " ";
    // Also restrict the number of characters per line to try and avoid
    // line-breaking in the middle of words like "-A", "R$" etc. which are
    // otherwise allowed by the standard unicode line-breaking rules.
    const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100;
    std::string rand_utf8;
    UNICHARSET unicharset;
    if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() &&
        !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) {
      tprintf("Failed to load unicharset from file %s\n",
              FLAGS_unicharset_file.c_str());
      exit(1);
    }

    // Split the text into whitespace-delimited words, recorded as
    // (byte offset, byte length) pairs.
    // If we are rendering ngrams that will be OCRed later, shuffle them so that
    // tesseract does not have difficulties finding correct baseline, word
    // spaces, etc.
    const char *str8 = src_utf8.c_str();
    const int len = static_cast<int>(src_utf8.length());
    std::vector<std::pair<int, int> > offsets;
    int offset = SpanUTF8Whitespace(str8);
    while (offset < len) {
      const int step = SpanUTF8NotWhitespace(str8 + offset);
      offsets.push_back(std::make_pair(offset, step));
      offset += step;
      offset += SpanUTF8Whitespace(str8 + offset);
    }
    // NOTE(review): std::random_shuffle was removed in C++17. Switching to
    // std::shuffle needs <random> and changes the resulting word order, so it
    // is deliberately left as is here.
    if (FLAGS_render_ngrams)
      std::random_shuffle(offsets.begin(), offsets.end());

    for (size_t i = 0, line = 1; i < offsets.size(); ++i) {
      const char *curr_pos = str8 + offsets[i].first;
      int ngram_len = offsets[i].second;
      // Skip words that contain characters not in found in unicharset.
      std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
      if (!FLAGS_unicharset_file.empty() &&
          !unicharset.encodable_string(cleaned.c_str(), nullptr)) {
        continue;
      }
      rand_utf8.append(curr_pos, ngram_len);
      // Start a new line once the accumulated text passes the per-line budget;
      // odd-numbered lines additionally begin with a separator.
      if (rand_utf8.length() > line * kCharsPerLine) {
        rand_utf8.append(" \n");
        ++line;
        if (line & 0x1) rand_utf8.append(kSeparator);
      } else {
        rand_utf8.append(kSeparator);
      }
    }
    tlog(1, "Rendered ngram string of size %d\n",
         static_cast<int>(rand_utf8.length()));
    src_utf8.swap(rand_utf8);
  }
  if (FLAGS_only_extract_font_properties) {
    tprintf("Extracting font properties only\n");
    ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str());
    tprintf("Done!\n");
    return 0;
  }

  int im = 0;
  std::vector<float> page_rotation;
  const char* to_render_utf8 = src_utf8.c_str();
  // The text is not modified from here on, so measure it once instead of
  // calling strlen again for every page.
  const size_t to_render_len = strlen(to_render_utf8);

  tesseract::TRand randomizer;
  randomizer.set_seed(kRandomSeed);
  std::vector<std::string> font_names;
  // We use a two pass mechanism to rotate images in both direction.
  // The first pass(0) will rotate the images in random directions and
  // the second pass(1) will mirror those rotations.
  int num_pass = FLAGS_bidirectional_rotation ? 2 : 1;
  for (int pass = 0; pass < num_pass; ++pass) {
    int page_num = 0;
    std::string font_used;
    for (size_t offset = 0;
         offset < to_render_len &&
         (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages);
         ++im, ++page_num) {
      tlog(1, "Starting page %d\n", im);
      Pix* pix = nullptr;
      // The loop condition guarantees offset < to_render_len at this point.
      const size_t remaining_len = to_render_len - offset;
      if (FLAGS_find_fonts) {
        offset += render.RenderAllFontsToImage(FLAGS_min_coverage,
                                               to_render_utf8 + offset,
                                               remaining_len,
                                               &font_used, &pix);
      } else {
        offset += render.RenderToImage(to_render_utf8 + offset,
                                       remaining_len, &pix);
      }
      if (pix != nullptr) {
        float rotation = 0;
        if (pass == 1) {
          // Pass 2, do mirror rotation.
          // NOTE(review): assumes pass 0 stored a rotation for this page_num,
          // i.e. every earlier page produced an image -- confirm.
          rotation = -1 * page_rotation[page_num];
        }
        if (FLAGS_degrade_image) {
          pix = DegradeImage(pix, FLAGS_exposure, &randomizer,
                             FLAGS_rotate_image ? &rotation : nullptr);
        }
        render.RotatePageBoxes(rotation);

        if (pass == 0) {
          // Pass 1, rotate randomly and store the rotation..
          page_rotation.push_back(rotation);
        }

        // Convert to 8-bit gray, then threshold to a binary image for output.
        Pix* gray_pix = pixConvertTo8(pix, false);
        pixDestroy(&pix);
        Pix* binary = pixThresholdToBinary(gray_pix, 128);
        pixDestroy(&gray_pix);
        char tiff_name[1024];
        if (FLAGS_find_fonts) {
          if (FLAGS_render_per_font) {
            std::string fontname_for_file = tesseract::StringReplace(
                font_used, " ", "_");
            snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(),
                     fontname_for_file.c_str());
            pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w");
            tprintf("Rendered page %d to file %s\n", im, tiff_name);
          } else {
            font_names.push_back(font_used);
          }
        } else {
          // All pages go into one multi-page tiff: create it for the first
          // image, append for the rest.
          snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str());
          pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a");
          tprintf("Rendered page %d to file %s\n", im, tiff_name);
        }
        // Make individual glyphs
        if (FLAGS_output_individual_glyph_images) {
          if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) {
            tprintf("ERROR: Individual glyphs not saved\n");
          }
        }
        pixDestroy(&binary);
      }
      if (FLAGS_find_fonts && offset != 0) {
        // We just want a list of names, or some sample images so we don't need
        // to render more than the first page of the text.
        break;
      }
    }
  }
  if (!FLAGS_find_fonts) {
    std::string box_name = FLAGS_outputbase.c_str();
    box_name += ".box";
    render.WriteAllBoxes(box_name);
  } else if (!FLAGS_render_per_font && !font_names.empty()) {
    std::string filename = FLAGS_outputbase.c_str();
    filename += ".fontlist.txt";
    FILE* fp = fopen(filename.c_str(), "wb");
    if (fp == nullptr) {
      tprintf("Failed to create output font list %s\n", filename.c_str());
    } else {
      for (size_t i = 0; i < font_names.size(); ++i) {
        fprintf(fp, "%s\n", font_names[i].c_str());
      }
      fclose(fp);
    }
  }

  return 0;
}
// Example no. 6
// 0
// Apart from command-line flags, input is a collection of lstmf files, that
// were previously created using tesseract with the lstm.train config file.
// The program iterates over the inputs, feeding the data to the network,
// until the error rate reaches a specified target or max_iterations is reached.
int main(int argc, char **argv) {
  ParseArguments(&argc, &argv);
  // Purify the model name in case it is based on the network string.
  if (FLAGS_model_output.empty()) {
    tprintf("Must provide a --model_output!\n");
    return 1;
  }
  STRING model_output = FLAGS_model_output.c_str();
  for (int i = 0; i < model_output.length(); ++i) {
    if (model_output[i] == '[' || model_output[i] == ']')
      model_output[i] = '-';
    if (model_output[i] == '(' || model_output[i] == ')')
      model_output[i] = '_';
  }
  // Setup the trainer.
  STRING checkpoint_file = FLAGS_model_output.c_str();
  checkpoint_file += "_checkpoint";
  STRING checkpoint_bak = checkpoint_file + ".bak";
  tesseract::LSTMTrainer trainer(
      NULL, NULL, NULL, NULL, FLAGS_model_output.c_str(),
      checkpoint_file.c_str(), FLAGS_debug_interval,
      static_cast<inT64>(FLAGS_max_image_MB) * 1048576);

  // Reading something from an existing model doesn't require many flags,
  // so do it now and exit.
  if (FLAGS_stop_training || FLAGS_debug_network) {
    if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) {
      tprintf("Failed to read continue from: %s\n",
              FLAGS_continue_from.c_str());
      return 1;
    }
    if (FLAGS_debug_network) {
      trainer.DebugNetwork();
    } else {
      if (FLAGS_train_mode & tesseract::TF_INT_MODE)
        trainer.ConvertToInt();
      GenericVector<char> recognizer_data;
      trainer.SaveRecognitionDump(&recognizer_data);
      if (!tesseract::SaveDataToFile(recognizer_data,
                                     FLAGS_model_output.c_str())) {
        tprintf("Failed to write recognition model : %s\n",
                FLAGS_model_output.c_str());
      }
    }
    return 0;
  }

  // Get the list of files to process.
  if (FLAGS_train_listfile.empty()) {
    tprintf("Must supply a list of training filenames! --train_listfile\n");
    return 1;
  }
  GenericVector<STRING> filenames;
  if (!tesseract::LoadFileLinesToStrings(FLAGS_train_listfile.c_str(),
                                         &filenames)) {
    tprintf("Failed to load list of training filenames from %s\n",
            FLAGS_train_listfile.c_str());
    return 1;
  }

  UNICHARSET unicharset;
  // Checkpoints always take priority if they are available.
  if (trainer.TryLoadingCheckpoint(checkpoint_file.string()) ||
      trainer.TryLoadingCheckpoint(checkpoint_bak.string())) {
    tprintf("Successfully restored trainer from %s\n",
            checkpoint_file.string());
  } else {
    if (!FLAGS_continue_from.empty()) {
      // Load a past model file to improve upon.
      if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) {
        tprintf("Failed to continue from: %s\n", FLAGS_continue_from.c_str());
        return 1;
      }
      tprintf("Continuing from %s\n", FLAGS_continue_from.c_str());
      trainer.InitIterations();
    }
    if (FLAGS_continue_from.empty() || FLAGS_append_index >= 0) {
      // We need a unicharset to start from scratch or append.
      string unicharset_str;
      // Character coding to be used by the classifier.
      if (!unicharset.load_from_file(FLAGS_U.c_str())) {
        tprintf("Error: must provide a -U unicharset!\n");
        return 1;
      }
      tesseract::SetupBasicProperties(true, &unicharset);
      if (FLAGS_append_index >= 0) {
        tprintf("Appending a new network to an old one!!");
        if (FLAGS_continue_from.empty()) {
          tprintf("Must set --continue_from for appending!\n");
          return 1;
        }
      }
      // We are initializing from scratch.
      trainer.InitCharSet(unicharset, FLAGS_script_dir.c_str(),
                          FLAGS_train_mode);
      if (!trainer.InitNetwork(FLAGS_net_spec.c_str(), FLAGS_append_index,
                               FLAGS_net_mode, FLAGS_weight_range,
                               FLAGS_learning_rate, FLAGS_momentum)) {
        tprintf("Failed to create network from spec: %s\n",
                FLAGS_net_spec.c_str());
        return 1;
      }
      trainer.set_perfect_delay(FLAGS_perfect_sample_delay);
    }
  }
  if (!trainer.LoadAllTrainingData(filenames)) {
    tprintf("Load of images failed!!\n");
    return 1;
  }

  bool best_dumped = true;
  char* best_model_dump = NULL;
  size_t best_model_size = 0;
  STRING best_model_name;
  tesseract::LSTMTester tester(static_cast<inT64>(FLAGS_max_image_MB) *
                               1048576);
  tesseract::TestCallback tester_callback = nullptr;
  if (!FLAGS_eval_listfile.empty()) {
    if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) {
      tprintf("Failed to load eval data from: %s\n",
              FLAGS_eval_listfile.c_str());
      return 1;
    }
    tester_callback =
        NewPermanentTessCallback(&tester, &tesseract::LSTMTester::RunEvalAsync);
  }
  do {
    // Train a few.
    int iteration = trainer.training_iteration();
    for (int target_iteration = iteration + kNumPagesPerBatch;
         iteration < target_iteration;
         iteration = trainer.training_iteration()) {
      trainer.TrainOnLine(&trainer, false);
    }
    STRING log_str;
    trainer.MaintainCheckpoints(tester_callback, &log_str);
    tprintf("%s\n", log_str.string());
  } while (trainer.best_error_rate() > FLAGS_target_error_rate &&
           (trainer.training_iteration() < FLAGS_max_iterations ||
            FLAGS_max_iterations == 0));
  delete tester_callback;
  tprintf("Finished! Error rate = %g\n", trainer.best_error_rate());
  return 0;
} /* main */