Example #1
0
 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
     int count = 0;
     for (int i = 0; i < word.length(); ++i) {
         if (word.unicharset()->get_isalpha(word.unichar_id(i)))
             count++;
     }
     return count;
 }
Example #2
0
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
                                       const WERD_CHOICE &word2) {
  const UNICHARSET *uchset = word1.unicharset();
  if (word2.unicharset() != uchset) return false;
  int w1start, w1end;
  word1.punct_stripped(&w1start, &w1end);
  int w2start, w2end;
  word2.punct_stripped(&w2start, &w2end);
  if (w1end - w1start != w2end - w2start) return false;
  for (int i = 0; i < w1end - w1start; i++) {
    if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
        uchset->to_lower(word2.unichar_id(w2start + i))) {
        return false;
    }
  }
  return true;
}
Example #3
0
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
                          bool requires_complete) const {
  if (word.length() == 0) return !requires_complete;
  NODE_REF node = 0;
  int end_index = word.length() - 1;
  for (int i = 0; i < end_index; i++) {
    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
    if (edge == NO_EDGE) {
      return false;
    }
    if ((node = next_node(edge)) == 0) {
      // This only happens if all words following this edge terminate --
      // there are no larger words.  See Trie::add_word_to_dawg()
      return false;
    }
  }
  // Now check the last character.
  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
      NO_EDGE;
}
Example #4
0
bool Dict::absolute_garbage(const WERD_CHOICE &word,
                            const UNICHARSET &unicharset) {
  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
  int num_alphanum = 0;
  for (int x = 0; x < word.length(); ++x) {
    num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
                     unicharset.get_isdigit(word.unichar_id(x)));
  }
  return (static_cast<float>(num_alphanum) /
          static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
}
Example #5
0
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset.get_isupper(ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_islower(ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isdigit(ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];
    if (state == -1) return false;
  }
  return state != 5; // single lower is bad
}
Example #6
0
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
  int shortest = INT32_MAX;
  int curr_len = 0;
  for (int w = 0; w < WordChoice.length(); ++w) {
    if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
      curr_len++;
    } else if (curr_len > 0) {
      if (curr_len < shortest) shortest = curr_len;
      curr_len = 0;
    }
  }
  if (curr_len > 0 && curr_len < shortest) {
    shortest = curr_len;
  } else if (shortest == INT32_MAX) {
    shortest = 0;
  }
  return shortest;
}