bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { if (word.length() < kMinAbsoluteGarbageWordLength) return false; int num_alphanum = 0; for (int x = 0; x < word.length(); ++x) { num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); } return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac); }
bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice, XHeightConsistencyEnum xheight_consistency) { float CertaintyThreshold = stopper_nondict_certainty_base; int WordSize; if (stopper_no_acceptable_choices) return false; if (best_choice.length() == 0) return false; bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); bool is_case_ok = case_ok(best_choice, getUnicharset()); if (stopper_debug_level >= 1) { const char *xht = "UNKNOWN"; switch (xheight_consistency) { case XH_GOOD: xht = "NORMAL"; break; case XH_SUBNORMAL: xht = "SUBNORMAL"; break; case XH_INCONSISTENT: xht = "INCONSISTENT"; break; default: xht = "UNKNOWN"; } tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", best_choice.unichar_string().string(), (is_valid_word ? 'y' : 'n'), (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); } // Do not accept invalid words in PASS1. if (reject_offset_ <= 0.0f && !is_valid_word) return false; if (is_valid_word && is_case_ok) { WordSize = LengthOfShortestAlphaRun(best_choice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", best_choice.rating(), best_choice.certainty(), CertaintyThreshold); if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { return true; } else { if (stopper_debug_level >= 1) { tprintf("AcceptableChoice() returned false" " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, UniformCertainties(best_choice)); } return false; } }
inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { if (word.unicharset()->get_isalpha(word.unichar_id(i))) count++; } return count; }
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, bool requires_complete) const { if (word.length() == 0) return !requires_complete; NODE_REF node = 0; int end_index = word.length() - 1; for (int i = 0; i < end_index; i++) { EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); if (edge == NO_EDGE) { return false; } if ((node = next_node(edge)) == 0) { // This only happens if all words following this edge terminate -- // there are no larger words. See Trie::add_word_to_dawg() return false; } } // Now check the last character. return edge_char_of(node, word.unichar_id(end_index), requires_complete) != NO_EDGE; }
int Dict::UniformCertainties(const WERD_CHOICE& word) { float Certainty; float WorstCertainty = MAX_FLOAT32; float CertaintyThreshold; FLOAT64 TotalCertainty; FLOAT64 TotalCertaintySquared; FLOAT64 Variance; FLOAT32 Mean, StdDev; int word_length = word.length(); if (word_length < 3) return true; TotalCertainty = TotalCertaintySquared = 0.0; for (int i = 0; i < word_length; ++i) { Certainty = word.certainty(i); TotalCertainty += Certainty; TotalCertaintySquared += Certainty * Certainty; if (Certainty < WorstCertainty) WorstCertainty = Certainty; } // Subtract off worst certainty from statistics. word_length--; TotalCertainty -= WorstCertainty; TotalCertaintySquared -= WorstCertainty * WorstCertainty; Mean = TotalCertainty / word_length; Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) / (word_length * (word_length - 1))); if (Variance < 0.0) Variance = 0.0; StdDev = sqrt(Variance); CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; if (CertaintyThreshold > stopper_nondict_certainty_base) CertaintyThreshold = stopper_nondict_certainty_base; if (word.certainty() < CertaintyThreshold) { if (stopper_debug_level >= 1) tprintf("Stopper: Non-uniform certainty = %4.1f" " (m=%4.1f, s=%4.1f, t=%4.1f)\n", word.certainty(), Mean, StdDev, CertaintyThreshold); return false; } else { return true; } }
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { int state = 0; int x; for (x = 0; x < word.length(); ++x) { UNICHAR_ID ch_id = word.unichar_id(x); if (unicharset.get_isupper(ch_id)) state = case_state_table[state][1]; else if (unicharset.get_islower(ch_id)) state = case_state_table[state][2]; else if (unicharset.get_isdigit(ch_id)) state = case_state_table[state][3]; else state = case_state_table[state][0]; if (state == -1) return false; } return state != 5; // single lower is bad }
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { int shortest = INT32_MAX; int curr_len = 0; for (int w = 0; w < WordChoice.length(); ++w) { if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) { curr_len++; } else if (curr_len > 0) { if (curr_len < shortest) shortest = curr_len; curr_len = 0; } } if (curr_len > 0 && curr_len < shortest) { shortest = curr_len; } else if (shortest == INT32_MAX) { shortest = 0; } return shortest; }
bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) { float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; int WordSize; if (stopper_debug_level >= 1) { tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", BestChoice.debug_string(getUnicharset()).string(), (valid_word(BestChoice) ? 'y' : 'n'), (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y')); } if (BestChoice.length() == 0 || CurrentWordAmbig()) return false; if (BestChoice.fragment_mark()) { if (stopper_debug_level >= 1) { cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n"); } return false; } if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) { WordSize = LengthOfShortestAlphaRun(BestChoice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", BestChoice.certainty(), CertaintyThreshold); if (BestChoice.certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { if (stopper_debug_level >= 1) cprintf("ACCEPTED\n"); return true; } else { if (stopper_debug_level >= 1) cprintf("REJECTED\n"); return false; } }