Exemple #1
0
 // Returns a newly allocated, zero-terminated copy of str32 in which every
 // character that char_set's unicharset reports as lower-case is replaced by
 // its other-case form. The array is allocated with new[], so the caller
 // releases it with delete[]. Returns NULL when char_set is NULL, when the
 // input contains INVALID_UNICHAR_ID, or when an other-case form is missing
 // or not exactly one character long.
 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
     if (char_set == NULL) {
         return NULL;
     }
     UNICHARSET *unicharset = char_set->InternalUnicharset();
     const int length = StrLen(str32);
     char_32 *result = new char_32[length + 1];
     if (result == NULL) {
         return NULL;
     }
     for (int pos = 0; pos < length; ++pos) {
         const char_32 current = str32[pos];
         if (current == INVALID_UNICHAR_ID) {
             delete[] result;
             return NULL;
         }
         if (!unicharset->get_islower(char_set->ClassID(current))) {
             // Not lower-case: copy the character through unchanged.
             result[pos] = current;
             continue;
         }
         // Lower-case: look up the other-case class and its string form.
         const UNICHAR_ID other_case_id =
             unicharset->get_other_case(char_set->ClassID(current));
         const char_32 *other_case_str = char_set->ClassString(other_case_id);
         // The other-case form must be a single character to fit in place.
         if (other_case_str == NULL || StrLen(other_case_str) != 1) {
             delete[] result;
             return NULL;
         }
         result[pos] = other_case_str[0];
     }
     result[length] = 0;
     return result;
 }
// Helper to set the properties for an input unicharset file, writes to the
// output file. The basic and script properties are set using script_dir.
// If output_xheights_file is non-empty, the combined xheight string for
// script_dir is written to it.
// A failure to load the input unicharset is reported and nothing is written;
// a failure to save the output unicharset is reported.
void SetPropertiesForInputFile(const std::string& script_dir,
                               const std::string& input_unicharset_file,
                               const std::string& output_unicharset_file,
                               const std::string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset. Stop here on failure rather than deriving
  // properties from, and saving, a unicharset that was never loaded.
  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
    tprintf("Failed to load unicharset from %s\n",
            input_unicharset_file.c_str());
    return;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  SetupBasicProperties(true, false, &unicharset);
  tprintf("Setting script properties\n");
  SetScriptProperties(script_dir, &unicharset);
  if (!output_xheights_file.empty()) {
    std::string xheights_str = GetXheightString(script_dir, unicharset);
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  }

  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  if (!unicharset.save_to_file(output_unicharset_file.c_str())) {
    tprintf("Cannot save unicharset file %s.\n",
            output_unicharset_file.c_str());
  }
}
// Sets all the properties for this unicharset given a src unicharset with
// everything set. The unicharsets don't have to be the same, and graphemes
// are correctly accounted for.
void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
                                               const UNICHARSET& src) {
  for (int ch = start_index; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Setup the script_id, other_case, and mirror properly.
      const char* script = src.get_script_from_script_id(properties.script_id);
      properties.script_id = add_script(script);
      const char* other_case = src.id_to_unichar(properties.other_case);
      if (contains_unichar(other_case)) {
        properties.other_case = unichar_to_id(other_case);
      } else {
        properties.other_case = ch;
      }
      const char* mirror_str = src.id_to_unichar(properties.mirror);
      if (contains_unichar(mirror_str)) {
        properties.mirror = unichar_to_id(mirror_str);
      } else {
        properties.mirror = ch;
      }
      unichars[ch].properties.CopyFrom(properties);
      set_normed_ids(ch);
    } else {
      tprintf("Failed to get properties for index %d = %s\n", ch, utf8);
    }
  }
}
// Builds a unicharset from the unichars read out of the box files named on
// the command line and saves it as kUnicharsetFileName inside the output
// directory ("." unless overridden with -D).
// Returns 0 on success and -1 when a box file cannot be opened or the
// unicharset cannot be saved; exits with status 1 when run without arguments.
int main(int argc, char** argv) {
  int option;
  const char* output_directory = ".";
  STRING unicharset_file_name;
  // Special characters are now included by default.
  UNICHARSET unicharset;

  setlocale(LC_ALL, "");

  // Print usage
  if (argc <= 1) {
    printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
    exit(1);

  }

  // Parse arguments
  while ((option = tessopt(argc, argv, "D" )) != EOF) {
    switch (option) {
      case 'D':
        output_directory = tessoptarg;
        ++tessoptind;
        break;
    }
  }

  // Save file name
  unicharset_file_name = output_directory;
  unicharset_file_name += "/";
  unicharset_file_name += kUnicharsetFileName;

  // Load box files
  for (; tessoptind < argc; ++tessoptind) {
    printf("Extracting unicharset from %s\n", argv[tessoptind]);

    FILE* box_file = fopen(argv[tessoptind], "rb");
    if (box_file == NULL) {
      printf("Cannot open box file %s\n", argv[tessoptind]);
      return -1;
    }

    TBOX box;
    STRING unichar_string;
    int line_number = 0;
    while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
      unicharset.unichar_insert(unichar_string.string());
      set_properties(&unicharset, unichar_string.string());
    }
    // Release the handle before moving on to the next box file; otherwise
    // one FILE is leaked per input.
    fclose(box_file);
  }

  // Write unicharset file
  if (unicharset.save_to_file(unicharset_file_name.string())) {
    printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
  }
  else {
    printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
    return -1;
  }
  return 0;
}
/**
 * print_ratings_info
 *
 * Send all the ratings out to the logfile.
 *
 * @param fp file to use
 * @param ratings list of results
 * @param current_unicharset unicharset that can be used
 * for id-to-unichar conversion
 */
void print_ratings_info(FILE *fp,
                        BLOB_CHOICE_LIST *ratings,
                        const UNICHARSET &current_unicharset) {
  inT32 index;                    // to list
  inT32 best_index;               // to list
  FLOAT32 best_rat;               // rating
  FLOAT32 best_cert;              // certainty
  const char* first_char = NULL;  // character
  FLOAT32 first_rat;              // rating
  FLOAT32 first_cert;             // certainty
  const char* sec_char = NULL;    // character
  FLOAT32 sec_rat = 0.0f;         // rating
  FLOAT32 sec_cert = 0.0f;        // certainty
  BLOB_CHOICE_IT c_it = ratings;  // iterator

  index = ratings->length();
  if (index > 0) {
    first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
    first_rat = c_it.data()->rating();
    first_cert = -c_it.data()->certainty();
    if (index > 1) {
      sec_char = current_unicharset.id_to_unichar(
          c_it.data_relative(1)->unichar_id());
      sec_rat = c_it.data_relative(1)->rating();
      sec_cert = -c_it.data_relative(1)->certainty();
    } else {
      sec_char = NULL;
      sec_rat = -1;
      sec_cert = -1;
    }
  } else {
    first_char = NULL;
    first_rat = -1;
    first_cert = -1;
  }
  best_index = -1;
  best_rat = -1;
  best_cert = -1;
  for (index = 0, c_it.mark_cycle_pt(); !c_it.cycled_list();
       c_it.forward(), index++) {
    if (strcmp(current_unicharset.id_to_unichar(c_it.data()->unichar_id()),
               blob_answer) == 0) {
      best_index = index;
      best_rat = c_it.data()->rating();
      best_cert = -c_it.data()->certainty();
    }
  }
  if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
    first_char = NULL;
  if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
    sec_char = NULL;
  fprintf(matcher_fp,
          " " INT32FORMAT " " INT32FORMAT " %g %g %s %g %g %s %g %g\n",
          ratings->length(), best_index, best_rat, best_cert,
          first_char != NULL ? first_char : "~",
          first_rat, first_cert, sec_char != NULL ? sec_char : "~",
          sec_rat, sec_cert);
}
// Private constructor; ErrorCounter is expected to be used only through
// the static ComputeErrorRate.
// The count tables are sized from unicharset.size() and fontsize.
ErrorCounter::ErrorCounter(const UNICHARSET& unicharset, int fontsize)
    : scaled_error_(0.0),
      rating_epsilon_(kRatingEpsilon),
      unichar_counts_(unicharset.size(), unicharset.size(), 0),
      ok_score_hist_(0, 101),
      bad_score_hist_(0, 101),
      unicharset_(unicharset) {
  // One default-constructed Counts record per font.
  Counts initial_counts;
  font_counts_.init_to_size(fontsize, initial_counts);
  // One zeroed counter per unichar.
  multi_unichar_counts_.init_to_size(unicharset.size(), 0);
}
// Helper: for every script in sid_set other than its null script, looks up
// the script's name in osd_set and appends the resulting id to allowed_ids.
static void AddAllScriptsConverted(const UNICHARSET& sid_set,
                                   const UNICHARSET& osd_set,
                                   GenericVector<int>* allowed_ids) {
  for (int sid = 0; sid < sid_set.get_script_table_size(); ++sid) {
    if (sid == sid_set.null_sid()) {
      continue;  // The null script is never added.
    }
    const char* script_name = sid_set.get_script_from_script_id(sid);
    const int osd_id = osd_set.get_script_id_from_name(script_name);
    allowed_ids->push_back(osd_id);
  }
}
Exemple #8
0
bool Dict::absolute_garbage(const WERD_CHOICE &word,
                            const UNICHARSET &unicharset) {
  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
  int num_alphanum = 0;
  for (int x = 0; x < word.length(); ++x) {
    num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
                     unicharset.get_isdigit(word.unichar_id(x)));
  }
  return (static_cast<float>(num_alphanum) /
          static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
}
// Helper builds the combined x-heights string: for each script in the
// unicharset's script table, the contents of
// "<script_dir>/<script name>.xheights" are appended when that file can be
// read. Files that cannot be read are skipped.
std::string GetXheightString(const std::string& script_dir,
                        const UNICHARSET& unicharset) {
  std::string combined;
  for (int script_id = 0; script_id < unicharset.get_script_table_size();
       ++script_id) {
    std::string path(script_dir);
    path += "/";
    path += unicharset.get_script_from_script_id(script_id);
    path += ".xheights";
    std::string contents;
    if (File::ReadFileToString(path, &contents)) {
      combined += contents;
    }
  }
  return combined;
}
Exemple #10
0
// Print the best guesses out of the match rating matrix.
void MATRIX::print(const UNICHARSET &unicharset) const {
  tprintf("Ratings Matrix (top 3 choices)\n");
  int dim = dimension();
  int band_width = bandwidth();
  int row, col;
  for (col = 0; col < dim; ++col) {
    for (row = col; row < dim && row < col + band_width; ++row) {
      BLOB_CHOICE_LIST *rating = this->get(col, row);
      if (rating == NOT_CLASSIFIED) continue;
      BLOB_CHOICE_IT b_it(rating);
      tprintf("col=%d row=%d ", col, row);
      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
        tprintf("%s rat=%g cert=%g " ,
                unicharset.id_to_unichar(b_it.data()->unichar_id()),
                b_it.data()->rating(), b_it.data()->certainty());
      }
      tprintf("\n");
    }
    tprintf("\n");
  }
  tprintf("\n");
  for (col = 0; col < dim; ++col) tprintf("\t%d", col);
  tprintf("\n");
  for (row = 0; row < dim; ++row) {
    for (col = 0; col <= row; ++col) {
      if (col == 0) tprintf("%d\t", row);
      if (row >= col + band_width) {
        tprintf(" \t");
        continue;
      }
      BLOB_CHOICE_LIST *rating = this->get(col, row);
      if (rating != NOT_CLASSIFIED) {
        BLOB_CHOICE_IT b_it(rating);
        int counter = 0;
        for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
          tprintf("%s ",
                  unicharset.id_to_unichar(b_it.data()->unichar_id()));
          ++counter;
          if (counter == 3) break;
        }
        tprintf("\t");
      } else {
        tprintf(" \t");
      }
    }
    tprintf("\n");
  }
}
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Builds a WERD_CHOICE from the given string by walking it one unichar at
 * a time with unicharset.step() and recording each step in a lengths
 * string. If the walk ends exactly at the end of the string with a non-zero
 * last step, the word is initialized from the string and the lengths;
 * otherwise it is initialized empty and marked bad.
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset) {
  STRING src_lengths;
  int len = strlen(src_string);
  const char *cursor = src_string;
  int step = unicharset.step(cursor);
  // The loop test uses the step of the unichar just consumed (or of the
  // first unichar on entry); a zero step stops the walk.
  while (cursor < src_string + len && step > 0) {
    step = unicharset.step(cursor);
    src_lengths += step;
    cursor += step;
  }
  const bool fully_consumed = (step != 0 && cursor == src_string + len);
  if (fully_consumed) {
    this->init(src_string, src_lengths.string(),
               0.0, 0.0, NO_PERM, unicharset);
  } else {  // there must have been an invalid unichar in the string
    this->init(8);
    this->make_bad();
  }
}
/**
 * WERD_CHOICE::init
 *
 * Fills this WERD_CHOICE from the given string, per-unichar lengths,
 * rating, certainty and permuter. Each unichar id is looked up in
 * unicharset and every fragment length is set to 1. An empty src_string
 * yields an empty word initialized with init(8).
 *
 * The function assumes that src_string is not NULL.
 * src_lengths may be NULL, in which case every unichar in src_string is
 * taken to be of length 1.
 */
void WERD_CHOICE::init(const char *src_string,
                       const char *src_lengths,
                       float src_rating,
                       float src_certainty,
                       uinT8 src_permuter,
                       const UNICHARSET &unicharset) {
  const int src_string_len = strlen(src_string);
  if (src_string_len != 0) {
    // One entry per length byte, or per char when no lengths are given.
    this->init(src_lengths ? strlen(src_lengths): src_string_len);
    length_ = reserved_;
    int offset = 0;
    for (int idx = 0; idx < length_; ++idx) {
      int unichar_length = 1;
      if (src_lengths) {
        unichar_length = src_lengths[idx];
      }
      unichar_ids_[idx] =
          unicharset.unichar_to_id(src_string + offset, unichar_length);
      fragment_lengths_[idx] = 1;
      offset += unichar_length;
    }
  } else {
    this->init(8);
  }
  rating_ = src_rating;
  certainty_ = src_certainty;
  permuter_ = src_permuter;
}
// For each id in src, if it does not occur in this, add it, as in
// SetPropertiesFromOther, otherwise expand the ranges, as in
// ExpandRangesFromOther.
void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
  int initial_used = size_used;
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
      // Only use fully valid entries.
      tprintf("Bad properties for index %d, char %s: "
              "%d,%d %d,%d %d,%d %d,%d %d,%d\n",
              ch, utf8, src_props.min_bottom, src_props.max_bottom,
              src_props.min_top, src_props.max_top,
              src_props.min_width, src_props.max_width,
              src_props.min_bearing, src_props.max_bearing,
              src_props.min_advance, src_props.max_advance);
      continue;
    }
    int id = size_used;
    if (contains_unichar(utf8)) {
      id = unichar_to_id(utf8);
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
      unichar_insert(utf8);
      unichars[id].properties.SetRangesEmpty();
    }
  }
  // Set properties, including mirror and other_case, WITHOUT reordering
  // the unicharset.
  PartialSetPropertiesFromOther(initial_used, src);
}
Exemple #14
0
// Print the best guesses out of the match rating matrix.
void MATRIX::print(const UNICHARSET &unicharset) {
  tprintf("Ratings Matrix (top choices)\n");
  int row, col;
  for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col);
  tprintf("\n");
  for (row = 0; row < this->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      if (col == 0) tprintf("%d\t", row);
      BLOB_CHOICE_LIST *rating = this->get(col, row);
      if (rating != NOT_CLASSIFIED) {
        BLOB_CHOICE_IT b_it(rating);
        int counter = 0;
        for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
          tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id()));
          ++counter;
          if (counter == 3) break;
        }
        tprintf("\t");
      } else {
        tprintf(" \t");
      }
    }
    tprintf("\n");
  }
}
Exemple #15
0
/**********************************************************************
 * print_ratings_list
 *
 * Send all the ratings out to the logfile.
 **********************************************************************/
void print_ratings_list(
    const char *msg,                      // intro message
    BLOB_CHOICE_LIST *ratings,            // list of results
    const UNICHARSET &current_unicharset  // unicharset that can be used
                                          // for id-to-unichar conversion
    ) {
  if (ratings->length() == 0) {
    tprintf("%s:<none>\n", msg);
    return;
  }
  if (*msg != '\0') {
    tprintf("%s\n", msg);
  }
  BLOB_CHOICE_IT c_it;
  c_it.set_to_list(ratings);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    tprintf("r%.2f c%.2f : %d %s",
            c_it.data()->rating(), c_it.data()->certainty(),
            c_it.data()->unichar_id(),
            current_unicharset.debug_str(c_it.data()->unichar_id()).string());
    if (!c_it.at_last()) {
      tprintf("\n");
    }
  }
  tprintf("\n");
  fflush(stdout);
}
// Converts the code wc to its id in unicharset by going through the UTF-8
// string produced by UNICHAR::utf8_str(). That string is released here with
// delete[] once the lookup is done.
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
  UNICHAR code_point(wc);
  char *utf8 = code_point.utf8_str();
  const UNICHAR_ID id = unicharset.unichar_to_id(utf8);
  delete[] utf8;
  return id;
}
Exemple #17
0
int Dawg::check_for_words(const char *filename,
                          const UNICHARSET &unicharset,
                          bool enable_wildcard) const {
  if (filename == nullptr) return 0;

  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  int misses = 0;
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);

  word_file = fopen(filename, "r");
  if (word_file == nullptr) {
    tprintf("Error: Could not open file %s\n", filename);
    ASSERT_HOST(word_file);
  }

  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if (word.length() > 0 &&
        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!match_words(&word, 0, 0,
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
        tprintf("Missing word: %s\n", string);
        ++misses;
      }
    } else {
      tprintf("Failed to create a valid word from %s\n", string);
    }
  }
  fclose (word_file);
  // Make sure the user sees this with fprintf instead of tprintf.
  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
  return misses;
}
Exemple #18
0
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset.get_isupper(ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_islower(ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isdigit(ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];
    if (state == -1) return false;
  }
  return state != 5; // single lower is bad
}
// Expands the tops and bottoms and widths for this unicharset given a
// src unicharset with ranges in it. The unicharsets don't have to be the
// same, and graphemes are correctly accounted for.
void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
  for (int ch = 0; ch < size_used; ++ch) {
    const char* utf8 = id_to_unichar(ch);
    UNICHAR_PROPERTIES properties;
    if (src.GetStrProperties(utf8, &properties)) {
      // Expand just the ranges from properties.
      unichars[ch].properties.ExpandRangesFrom(properties);
    }
  }
}
Exemple #20
0
/**
 * print_ratings_info
 *
 * Send all the ratings out to the logfile.
 *
 * @param fp file to use
 * @param ratings list of results
 * @param current_unicharset unicharset that can be used
 * for id-to-unichar conversion
 */
void print_ratings_info(FILE *fp,
                        BLOB_CHOICE_LIST *ratings,
                        const UNICHARSET &current_unicharset) {
  inT32 index;                    // to list
  const char* first_char = NULL;  // character
  FLOAT32 first_rat;              // rating
  FLOAT32 first_cert;             // certainty
  const char* sec_char = NULL;    // character
  FLOAT32 sec_rat = 0.0f;         // rating
  FLOAT32 sec_cert = 0.0f;        // certainty
  BLOB_CHOICE_IT c_it = ratings;  // iterator

  index = ratings->length();
  if (index > 0) {
    first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
    first_rat = c_it.data()->rating();
    first_cert = -c_it.data()->certainty();
    if (index > 1) {
      sec_char = current_unicharset.id_to_unichar(
          c_it.data_relative(1)->unichar_id());
      sec_rat = c_it.data_relative(1)->rating();
      sec_cert = -c_it.data_relative(1)->certainty();
    } else {
      sec_char = NULL;
      sec_rat = -1;
      sec_cert = -1;
    }
  } else {
    first_char = NULL;
    first_rat = -1;
    first_cert = -1;
  }
  if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
    first_char = NULL;
  if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
    sec_char = NULL;
  tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
          ratings->length(),
          first_char != NULL ? first_char : "~",
          first_rat, first_cert, sec_char != NULL ? sec_char : "~",
          sec_rat, sec_cert);
}
// Makes this a copy of src. This unicharset is cleared first, then every
// src unichar is inserted in src order and its ranges are expanded from the
// src entry with the same index. The remaining properties, including mirror
// and other_case, are filled in by PartialSetPropertiesFromOther.
// Does NOT reorder the set!
void UNICHARSET::CopyFrom(const UNICHARSET& src) {
  clear();
  for (int id = 0; id < src.size_used; ++id) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[id].properties;
    unichar_insert(src.id_to_unichar(id));
    unichars[id].properties.ExpandRangesFrom(src_props);
  }
  PartialSetPropertiesFromOther(0, src);
}
int main(int argc, char** argv) {
  // Sets properties on the input unicharset file, and writes:
  //   rootdir/lang/lang.charset_size=ddd.txt
  //   rootdir/lang/lang.traineddata
  //   rootdir/lang/lang.unicharset
  // If the 3 word lists are provided, the dawgs are also added
  // to the traineddata file.
  // The output unicharset and charset_size files are just for
  // human readability.
  tesseract::CheckSharedLibraryVersion();
  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);

  GenericVector<STRING> words, puncs, numbers;
  // If these reads fail, we get a warning message and an empty list of words.
  tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
  tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
  tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);
  // Load the input unicharset
  UNICHARSET unicharset;
  if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
    tprintf("Failed to load unicharset from %s\n",
            FLAGS_input_unicharset.c_str());
    return 1;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          FLAGS_input_unicharset.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  tesseract::SetupBasicProperties(/*report_errors*/ true,
                                  /*decompose (NFD)*/ false, &unicharset);
  tprintf("Setting script properties\n");
  tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
  // Combine everything into a traineddata file.
  return tesseract::CombineLangModel(
      unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(),
      FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder,
      words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
      /*writer*/ nullptr);
}
/**
 * string_and_lengths
 *
 * Resets word_str and appends to it the unichar string of every id in this
 * word, converted with current_unicharset. When word_lengths_str is not
 * NULL it is reset as well and receives the strlen of each of those
 * unichar strings.
 */
void WERD_CHOICE::string_and_lengths(const UNICHARSET &current_unicharset,
                                     STRING *word_str,
                                     STRING *word_lengths_str) const {
  const bool want_lengths = (word_lengths_str != NULL);
  *word_str = "";
  if (want_lengths) {
    *word_lengths_str = "";
  }
  for (int pos = 0; pos < length_; ++pos) {
    const char *unichar = current_unicharset.id_to_unichar(unichar_ids_[pos]);
    *word_str += unichar;
    if (want_lengths) {
      *word_lengths_str += strlen(unichar);
    }
  }
}
Exemple #24
0
bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set,
                              const WERD_CHOICE *choice,
                              const GenericVector<STRING> &truth_text) {
  if (choice == NULL) return false;
  int i;
  STRING truth_str;
  for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i];
  STRING normed_choice_str;
  for (i = 0; i < choice->length(); ++i) {
    normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i));
  }
  return (truth_str == normed_choice_str);
}
/**
 * print_char_choices_list
 */
void print_char_choices_list(const char *msg,
                             const BLOB_CHOICE_LIST_VECTOR &char_choices,
                             const UNICHARSET &current_unicharset,
                             BOOL8 detailed) {
  if (*msg != '\0') tprintf("%s\n", msg);
  for (int x = 0; x < char_choices.length(); ++x) {
    BLOB_CHOICE_IT c_it;
    c_it.set_to_list(char_choices.get(x));
    tprintf("char[%d]: %s\n", x,
            current_unicharset.debug_str( c_it.data()->unichar_id()).string());
    if (detailed)
      print_ratings_list("  ", char_choices.get(x), current_unicharset);
  }
}
Exemple #26
0
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Builds a WERD_CHOICE from the given string. The string is encoded with
 * unicharset.encode_string(); on success the word is initialized from the
 * string and the resulting lengths, otherwise it is initialized empty and
 * marked bad.
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset)
    : unicharset_(&unicharset){
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  const bool encoded =
      unicharset.encode_string(src_string, true, &encoding, &lengths, NULL);
  if (!encoded) {
    // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
    return;
  }
  // Terminate the lengths so they can be handed over as a C string.
  lengths.push_back('\0');
  STRING src_lengths = &lengths[0];
  this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
}
Exemple #27
0
/* static */
// Classifies the vertical position of blob_box for the given unichar.
// The checks are applied in this order:
//   - bottom at or below kMaxDropCapBottom             -> SP_DROPCAP
//   - top and bottom both below the subscript thresholds -> SP_SUBSCRIPT
//   - bottom above the superscript threshold           -> SP_SUPERSCRIPT
//   - otherwise                                        -> SP_NORMAL
// The thresholds come from the unichar's top/bottom ranges in unicharset,
// kBlnBaselineOffset and the kMin*scriptOffset constants. With print_debug
// set, the inputs and the decision are printed via tprintf.
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
                                        const UNICHARSET& unicharset,
                                        const TBOX& blob_box,
                                        UNICHAR_ID unichar_id) {
  const int top = blob_box.top();
  const int bottom = blob_box.bottom();

  int min_bottom, max_bottom, min_top, max_top;
  unicharset.get_top_bottom(unichar_id,
                            &min_bottom, &max_bottom,
                            &min_top, &max_top);

  const int sub_thresh_top = min_top - kMinSubscriptOffset;
  const int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
  const int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;

  ScriptPos position = tesseract::SP_NORMAL;
  const bool is_dropcap = bottom <= kMaxDropCapBottom;
  const bool is_subscript = top < sub_thresh_top && bottom < sub_thresh_bot;
  const bool is_superscript = bottom > sup_thresh_bot;
  if (is_dropcap) {
    position = tesseract::SP_DROPCAP;
  } else if (is_subscript) {
    position = tesseract::SP_SUBSCRIPT;
  } else if (is_superscript) {
    position = tesseract::SP_SUPERSCRIPT;
  }

  if (print_debug) {
    const char *position_name = ScriptPosToString(position);
    tprintf("%s Character %s[bot:%d top: %d]  "
            "bot_range[%d,%d]  top_range[%d, %d] "
            "sub_thresh[bot:%d top:%d]  sup_thresh_bot %d\n",
            position_name, unicharset.id_to_unichar(unichar_id),
            bottom, top,
            min_bottom, max_bottom, min_top, max_top,
            sub_thresh_bot, sub_thresh_top,
            sup_thresh_bot);
  }
  return position;
}
int main(int argc, char *argv[]) {
  if (argc != 4) {
    tprintf("Print all the words in a given dawg.\n");
    tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n",
            argv[0]);
    return 1;
  }
  const char *unicharset_file = argv[1];
  const char *dawg_file = argv[2];
  const char *wordlist_file = argv[3];
  UNICHARSET unicharset;
  if (!unicharset.load_from_file(unicharset_file)) {
    tprintf("Error loading unicharset from %s.\n", unicharset_file);
    return 1;
  }
  tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file);
  if (dict == NULL) {
    tprintf("Error loading dictionary from %s.\n", dawg_file);
    return 1;
  }
  int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file);
  delete dict;
  return retval;
}
// Helper prints the given set of blob choices.
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
                      const UNICHARSET& unicharset,
                      const char *label, FILE *output_file) {
  float rating = 0.0f;
  float certainty = 0.0f;
  for (int i = 0; i < length; ++i) {
    const BLOB_CHOICE* blob_choice = blob_choices[i];
    fprintf(output_file, "%s",
           unicharset.id_to_unichar(blob_choice->unichar_id()));
    rating += blob_choice->rating();
    if (certainty > blob_choice->certainty())
      certainty = blob_choice->certainty();
  }
  fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
         label, rating, certainty);
}
Exemple #30
0
    // Returns true when str32 is either "all one case" or "capitalized":
    //   - all one case: no upper-case character is directly followed by a
    //     lower-case one and no lower-case character is directly followed by
    //     an upper-case one;
    //   - capitalized: the first character is upper-case and no later
    //     character is.
    // Case is taken from char_set's unicharset when char_set is given, and
    // from isupper()/islower() otherwise. A NULL or empty string returns
    // true.
    bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
        // Nothing to inspect. Returning here also keeps the loops below,
        // which start at index 1, from reading past the terminator of an
        // empty string.
        if (str32 == NULL || str32[0] == 0) {
            return true;
        }

        bool all_one_case = true;
        bool capitalized = false;
        bool prev_upper;
        bool prev_lower;

        if (!char_set) {
            // If cube char_set is missing, use C-locale-dependent functions
            // to determine case properties.
            // NOTE(review): isupper()/islower() are only defined for values
            // representable as unsigned char (or EOF) - confirm that callers
            // only pass such characters on this path.
            prev_upper = isupper(str32[0]);
            prev_lower = islower(str32[0]);
            capitalized = prev_upper;
            for (int c = 1; str32[c] != 0; ++c) {
                const bool cur_upper = isupper(str32[c]);
                const bool cur_lower = islower(str32[c]);
                if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
                    all_one_case = false;
                if (cur_upper)
                    capitalized = false;
                prev_upper = cur_upper;
                prev_lower = cur_lower;
            }
        } else {
            // Use UNICHARSET functions to determine case properties
            UNICHARSET *unicharset = char_set->InternalUnicharset();
            prev_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
            prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
            capitalized = prev_upper;
            // The length is fixed for the whole scan; compute it once.
            const int len = StrLen(str32);
            for (int c = 1; c < len; ++c) {
                const bool cur_upper =
                    unicharset->get_isupper(char_set->ClassID(str32[c]));
                const bool cur_lower =
                    unicharset->get_islower(char_set->ClassID(str32[c]));
                if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
                    all_one_case = false;
                if (cur_upper)
                    capitalized = false;
                prev_upper = cur_upper;
                prev_lower = cur_lower;
            }
        }
        return all_one_case || capitalized;
    }