// Make a text string from the internal data structures. // The input page_res is deleted. char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { if (page_res != NULL) { int total_length = TextLength(page_res); PAGE_RES_IT page_res_it(page_res); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { strcpy(ptr, choice->string().string()); ptr += strlen(ptr); if (word->word->flag(W_EOL)) *ptr++ = '\n'; else *ptr++ = ' '; } } *ptr++ = '\n'; *ptr = '\0'; delete page_res; return result; } return NULL; }
bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice, XHeightConsistencyEnum xheight_consistency) { float CertaintyThreshold = stopper_nondict_certainty_base; int WordSize; if (stopper_no_acceptable_choices) return false; if (best_choice.length() == 0) return false; bool no_dang_ambigs = !best_choice.dangerous_ambig_found(); bool is_valid_word = valid_word_permuter(best_choice.permuter(), false); bool is_case_ok = case_ok(best_choice, getUnicharset()); if (stopper_debug_level >= 1) { const char *xht = "UNKNOWN"; switch (xheight_consistency) { case XH_GOOD: xht = "NORMAL"; break; case XH_SUBNORMAL: xht = "SUBNORMAL"; break; case XH_INCONSISTENT: xht = "INCONSISTENT"; break; default: xht = "UNKNOWN"; } tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n", best_choice.unichar_string().string(), (is_valid_word ? 'y' : 'n'), (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height()); } // Do not accept invalid words in PASS1. if (reject_offset_ <= 0.0f && !is_valid_word) return false; if (is_valid_word && is_case_ok) { WordSize = LengthOfShortestAlphaRun(best_choice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n", best_choice.rating(), best_choice.certainty(), CertaintyThreshold); if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold && xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) { return true; } else { if (stopper_debug_level >= 1) { tprintf("AcceptableChoice() returned false" " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n", no_dang_ambigs, best_choice.certainty(), CertaintyThreshold, UniformCertainties(best_choice)); } return false; } }
inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { if (word.unicharset()->get_isalpha(word.unichar_id(i))) count++; } return count; }
bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { if (word.length() < kMinAbsoluteGarbageWordLength) return false; int num_alphanum = 0; for (int x = 0; x < word.length(); ++x) { num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); } return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac); }
int Dict::UniformCertainties(const WERD_CHOICE& word) { float Certainty; float WorstCertainty = MAX_FLOAT32; float CertaintyThreshold; FLOAT64 TotalCertainty; FLOAT64 TotalCertaintySquared; FLOAT64 Variance; FLOAT32 Mean, StdDev; int word_length = word.length(); if (word_length < 3) return true; TotalCertainty = TotalCertaintySquared = 0.0; for (int i = 0; i < word_length; ++i) { Certainty = word.certainty(i); TotalCertainty += Certainty; TotalCertaintySquared += Certainty * Certainty; if (Certainty < WorstCertainty) WorstCertainty = Certainty; } // Subtract off worst certainty from statistics. word_length--; TotalCertainty -= WorstCertainty; TotalCertaintySquared -= WorstCertainty * WorstCertainty; Mean = TotalCertainty / word_length; Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) / (word_length * (word_length - 1))); if (Variance < 0.0) Variance = 0.0; StdDev = sqrt(Variance); CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev; if (CertaintyThreshold > stopper_nondict_certainty_base) CertaintyThreshold = stopper_nondict_certainty_base; if (word.certainty() < CertaintyThreshold) { if (stopper_debug_level >= 1) tprintf("Stopper: Non-uniform certainty = %4.1f" " (m=%4.1f, s=%4.1f, t=%4.1f)\n", word.certainty(), Mean, StdDev, CertaintyThreshold); return false; } else { return true; } }
// Update hyphen_word_, and copy the given DawgPositionVectors into // hyphen_active_dawgs_. void Dict::set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs) { if (hyphen_word_ == NULL) { hyphen_word_ = new WERD_CHOICE(word.unicharset()); hyphen_word_->make_bad(); } if (hyphen_word_->rating() > word.rating()) { *hyphen_word_ = word; // Remove the last unichar id as it is a hyphen, and remove // any unichar_string/lengths that are present. hyphen_word_->remove_last_unichar_id(); hyphen_active_dawgs_ = active_dawgs; } if (hyphen_debug_level) { hyphen_word_->print("set_hyphen_word: "); } }
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { int state = 0; int x; for (x = 0; x < word.length(); ++x) { UNICHAR_ID ch_id = word.unichar_id(x); if (unicharset.get_isupper(ch_id)) state = case_state_table[state][1]; else if (unicharset.get_islower(ch_id)) state = case_state_table[state][2]; else if (unicharset.get_isdigit(ch_id)) state = case_state_table[state][3]; else state = case_state_table[state][0]; if (state == -1) return false; } return state != 5; // single lower is bad }
int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { int shortest = INT32_MAX; int curr_len = 0; for (int w = 0; w < WordChoice.length(); ++w) { if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) { curr_len++; } else if (curr_len > 0) { if (curr_len < shortest) shortest = curr_len; curr_len = 0; } } if (curr_len > 0 && curr_len < shortest) { shortest = curr_len; } else if (shortest == INT32_MAX) { shortest = 0; } return shortest; }
// Return the maximum length that the output text string might occupy. int TessBaseAPI::TextLength(PAGE_RES* page_res) { PAGE_RES_IT page_res_it(page_res); int total_length = 2; // Iterate over the data structures to extract the recognition result. for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { total_length += choice->string().length() + 1; for (int i = 0; i < word->reject_map.length(); ++i) { if (word->reject_map[i].rejected()) ++total_length; } } } return total_length; }
bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) { float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; int WordSize; if (stopper_debug_level >= 1) { tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n", BestChoice.debug_string(getUnicharset()).string(), (valid_word(BestChoice) ? 'y' : 'n'), (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'), ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y')); } if (BestChoice.length() == 0 || CurrentWordAmbig()) return false; if (BestChoice.fragment_mark()) { if (stopper_debug_level >= 1) { cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n"); } return false; } if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) { WordSize = LengthOfShortestAlphaRun(BestChoice); WordSize -= stopper_smallword_size; if (WordSize < 0) WordSize = 0; CertaintyThreshold += WordSize * stopper_certainty_per_char; } if (stopper_debug_level >= 1) cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", BestChoice.certainty(), CertaintyThreshold); if (BestChoice.certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) { if (stopper_debug_level >= 1) cprintf("ACCEPTED\n"); return true; } else { if (stopper_debug_level >= 1) cprintf("REJECTED\n"); return false; } }
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word, bool requires_complete) const { if (word.length() == 0) return !requires_complete; NODE_REF node = 0; int end_index = word.length() - 1; for (int i = 0; i < end_index; i++) { EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false); if (edge == NO_EDGE) { return false; } if ((node = next_node(edge)) == 0) { // This only happens if all words following this edge terminate -- // there are no larger words. See Trie::add_word_to_dawg() return false; } } // Now check the last character. return edge_char_of(node, word.unichar_id(end_index), requires_complete) != NO_EDGE; }
// Returns an array of all word confidences, terminated by -1. int* TessBaseAPI::AllTextConfidences(PAGE_RES* page_res) { if (!page_res) return NULL; int n_word = 0; PAGE_RES_IT res_it(page_res); for (res_it.restart_page(); res_it.word () != NULL; res_it.forward()) n_word++; int* conf = new int[n_word+1]; n_word = 0; for (res_it.restart_page(); res_it.word () != NULL; res_it.forward()) { WERD_RES *word = res_it.word(); WERD_CHOICE* choice = word->best_choice; int w_conf = static_cast<int>(100 + 5 * choice->certainty()); // This is the eq for converting Tesseract confidence to 1..100 if (w_conf < 0) w_conf = 0; if (w_conf > 100) w_conf = 100; conf[n_word++] = w_conf; } conf[n_word] = -1; return conf; }
/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { PAGE_RES_IT pr_it(page_res); for (WERD_RES *word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size()); for (int i = 0; i < word_res->correct_text.size(); ++i) { // The part before the first space is the real ground truth, and the // rest is the bounding box location and page number. GenericVector<STRING> tokens; word_res->correct_text[i].split(' ', &tokens); UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string()); choice->append_unichar_id_space_allocated(char_id, word_res->best_state[i], 0.0f, 0.0f); } word_res->ClearWordChoices(); word_res->LogNewRawChoice(choice); word_res->LogNewCookedChoice(1, false, choice); } }
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2) { const UNICHARSET *uchset = word1.unicharset(); if (word2.unicharset() != uchset) return false; int w1start, w1end; word1.punct_stripped(&w1start, &w1end); int w2start, w2end; word2.punct_stripped(&w2start, &w2end); if (w1end - w1start != w2end - w2start) return false; for (int i = 0; i < w1end - w1start; i++) { if (uchset->to_lower(word1.unichar_id(w1start + i)) != uchset->to_lower(word2.unichar_id(w2start + i))) { return false; } } return true; }