Example #1
0
bool Dict::absolute_garbage(const WERD_CHOICE &word,
                            const UNICHARSET &unicharset) {
  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
  int num_alphanum = 0;
  for (int x = 0; x < word.length(); ++x) {
    num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
                     unicharset.get_isdigit(word.unichar_id(x)));
  }
  return (static_cast<float>(num_alphanum) /
          static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
}
Example #2
0
bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice,
                            XHeightConsistencyEnum xheight_consistency) {
  float CertaintyThreshold = stopper_nondict_certainty_base;
  int WordSize;

  if (stopper_no_acceptable_choices) return false;

  if (best_choice.length() == 0) return false;

  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
  bool is_case_ok = case_ok(best_choice, getUnicharset());

  if (stopper_debug_level >= 1) {
    const char *xht = "UNKNOWN";
    switch (xheight_consistency) {
      case XH_GOOD:  xht = "NORMAL"; break;
      case XH_SUBNORMAL:  xht = "SUBNORMAL"; break;
      case XH_INCONSISTENT:  xht = "INCONSISTENT"; break;
      default: xht = "UNKNOWN";
    }
    tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
            best_choice.unichar_string().string(),
            (is_valid_word ? 'y' : 'n'),
            (is_case_ok ? 'y' : 'n'),
            xht,
            best_choice.min_x_height(),
            best_choice.max_x_height());
  }
  // Do not accept invalid words in PASS1.
  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
  if (is_valid_word && is_case_ok) {
    WordSize = LengthOfShortestAlphaRun(best_choice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
            best_choice.rating(), best_choice.certainty(), CertaintyThreshold);

  if (no_dang_ambigs &&
      best_choice.certainty() > CertaintyThreshold &&
      xheight_consistency < XH_INCONSISTENT &&
      UniformCertainties(best_choice)) {
    return true;
  } else {
    if (stopper_debug_level >= 1) {
      tprintf("AcceptableChoice() returned false"
              " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
              no_dang_ambigs, best_choice.certainty(),
              CertaintyThreshold,
              UniformCertainties(best_choice));
    }
    return false;
  }
}
Example #3
0
 inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
     int count = 0;
     for (int i = 0; i < word.length(); ++i) {
         if (word.unicharset()->get_isalpha(word.unichar_id(i)))
             count++;
     }
     return count;
 }
Example #4
0
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
                          bool requires_complete) const {
  if (word.length() == 0) return !requires_complete;
  NODE_REF node = 0;
  int end_index = word.length() - 1;
  for (int i = 0; i < end_index; i++) {
    EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
    if (edge == NO_EDGE) {
      return false;
    }
    if ((node = next_node(edge)) == 0) {
      // This only happens if all words following this edge terminate --
      // there are no larger words.  See Trie::add_word_to_dawg()
      return false;
    }
  }
  // Now check the last character.
  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
      NO_EDGE;
}
Example #5
0
int Dict::UniformCertainties(const WERD_CHOICE& word) {
  float Certainty;
  float WorstCertainty = MAX_FLOAT32;
  float CertaintyThreshold;
  FLOAT64 TotalCertainty;
  FLOAT64 TotalCertaintySquared;
  FLOAT64 Variance;
  FLOAT32 Mean, StdDev;
  int word_length = word.length();

  if (word_length < 3)
    return true;

  TotalCertainty = TotalCertaintySquared = 0.0;
  for (int i = 0; i < word_length; ++i) {
    Certainty = word.certainty(i);
    TotalCertainty += Certainty;
    TotalCertaintySquared += Certainty * Certainty;
    if (Certainty < WorstCertainty)
      WorstCertainty = Certainty;
  }

  // Subtract off worst certainty from statistics.
  word_length--;
  TotalCertainty -= WorstCertainty;
  TotalCertaintySquared -= WorstCertainty * WorstCertainty;

  Mean = TotalCertainty / word_length;
  Variance = ((word_length * TotalCertaintySquared -
    TotalCertainty * TotalCertainty) /
    (word_length * (word_length - 1)));
  if (Variance < 0.0)
    Variance = 0.0;
  StdDev = sqrt(Variance);

  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
  if (CertaintyThreshold > stopper_nondict_certainty_base)
    CertaintyThreshold = stopper_nondict_certainty_base;

  if (word.certainty() < CertaintyThreshold) {
    if (stopper_debug_level >= 1)
      tprintf("Stopper: Non-uniform certainty = %4.1f"
              " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
              word.certainty(), Mean, StdDev, CertaintyThreshold);
    return false;
  } else {
    return true;
  }
}
Example #6
0
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset.get_isupper(ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_islower(ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isdigit(ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];
    if (state == -1) return false;
  }
  return state != 5; // single lower is bad
}
Example #7
0
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const {
  int shortest = INT32_MAX;
  int curr_len = 0;
  for (int w = 0; w < WordChoice.length(); ++w) {
    if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
      curr_len++;
    } else if (curr_len > 0) {
      if (curr_len < shortest) shortest = curr_len;
      curr_len = 0;
    }
  }
  if (curr_len > 0 && curr_len < shortest) {
    shortest = curr_len;
  } else if (shortest == INT32_MAX) {
    shortest = 0;
  }
  return shortest;
}
Example #8
0
bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
  int WordSize;

  if (stopper_debug_level >= 1) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
            BestChoice.debug_string(getUnicharset()).string(),
            (valid_word(BestChoice) ? 'y' : 'n'),
            (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
            ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
  }

  if (BestChoice.length() == 0 || CurrentWordAmbig())
    return false;
  if (BestChoice.fragment_mark()) {
    if (stopper_debug_level >= 1) {
      cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
    }
    return false;
  }
  if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) {
    WordSize = LengthOfShortestAlphaRun(BestChoice);
    WordSize -= stopper_smallword_size;
    if (WordSize < 0)
      WordSize = 0;
    CertaintyThreshold += WordSize * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1)
    cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
      BestChoice.certainty(), CertaintyThreshold);

  if (BestChoice.certainty() > CertaintyThreshold &&
      !stopper_no_acceptable_choices) {
    if (stopper_debug_level >= 1)
      cprintf("ACCEPTED\n");
    return true;
  }
  else {
    if (stopper_debug_level >= 1)
      cprintf("REJECTED\n");
    return false;
  }
}