/**
 * Decide whether the recogniser should adapt to a word.
 *
 * @param word  Word result to examine. Reads tess_would_adapt,
 *              tess_accepted and best_choice.
 * @param mode  Bit mask; bit positions are given by the local MODES enum.
 *              A value of 0 disables adaption altogether.
 * @return FALSE if mode is 0, if neither enabled acceptance bit is set on
 *         the word, or if any enabled CHECK_* test vetoes the word;
 *         otherwise the accumulated acceptance status.
 */
BOOL8 word_adaptable(  // should we adapt?
                     WERD_RES *word,
                     uinT16 mode) {
  // Bit positions within `mode`.
  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  BOOL8 adapt_ok = FALSE;
  BITS16 mode_bits(mode);

  // Mode 0 means: no adaption at all.
  if (mode == 0) {
    return FALSE;
  }

  // Either enabled acceptance bit is enough to make the word a candidate.
  if (mode_bits.bit(ADAPTABLE_WERD)) {
    adapt_ok |= word->tess_would_adapt;
  }
  if (mode_bits.bit(ACCEPTABLE_WERD)) {
    adapt_ok |= word->tess_accepted;
  }

  // Not a candidate: the veto checks below are irrelevant.
  if (!adapt_ok) {
    return FALSE;
  }

  // Veto: best choice did not come from one of the dawg/number permuters.
  if (mode_bits.bit(CHECK_DAWGS)) {
    if (!(word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
          word->best_choice->permuter() == FREQ_DAWG_PERM ||
          word->best_choice->permuter() == USER_DAWG_PERM ||
          word->best_choice->permuter() == NUMBER_PERM)) {
      return FALSE;
    }
  }

  // Veto: one_ell_conflict() reports a conflict (called without map update).
  if (mode_bits.bit(CHECK_ONE_ELL_CONFLICT)) {
    if (one_ell_conflict(word, FALSE)) {
      return FALSE;
    }
  }

  // Veto: the best choice string contains a space character.
  if (mode_bits.bit(CHECK_SPACES)) {
    if (strchr(word->best_choice->string().string(), ' ') != NULL) {
      return FALSE;
    }
  }

  // Veto: NoDangerousAmbig() does not clear the word.
  // (Historically: test_ambig_word(word).)
  if (mode_bits.bit(CHECK_AMBIG_WERD)) {
    if (!NoDangerousAmbig(word->best_choice->string().string(),
                          word->best_choice->lengths().string(),
                          NULL)) {
      return FALSE;
    }
  }

  return adapt_ok;
}
/**
 * Set word->done, the flag recording whether the word result is accepted.
 *
 * The word starts as done when it is tess_accepted and its best choice
 * string contains no space. It is then demoted to not-done when:
 *   - on pass 1, it is non-dictionary or ambiguous and one_ell_conflict()
 *     reports a conflict; or
 *   - it is ambiguous, or it is neither a dictionary word nor produced by
 *     the number permuter.
 *
 * @param word  Word result to update (word->done is written).
 * @param pass  Recognition pass number; the one/ell test only runs on pass 1.
 */
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  // Initial verdict: accepted by tess and free of blanks.
  if (word->tess_accepted) {
    word->done =
        strchr(word->best_choice->unichar_string().string(), ' ') == NULL;
  } else {
    word->done = FALSE;
  }

  bool ambiguous = word->best_choice->dangerous_ambig_found();
  bool in_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
                 word->best_choice->permuter() == FREQ_DAWG_PERM ||
                 word->best_choice->permuter() == USER_DAWG_PERM;

  // Pass 1 only: suspicious words are additionally tested for 1/I/l trouble.
  if (word->done && pass == 1) {
    if (!in_dict || ambiguous) {
      if (one_ell_conflict(word, FALSE)) {
        if (tessedit_rejection_debug) {
          tprintf("one_ell_conflict detected\n");
        }
        word->done = FALSE;
      }
    }
  }

  // Any pass: non-dictionary (and non-number) or ambiguous words are not done.
  if (word->done) {
    bool non_dict =
        !in_dict && word->best_choice->permuter() != NUMBER_PERM;
    if (non_dict || ambiguous) {
      if (tessedit_rejection_debug) {
        tprintf("non-dict or ambig word detected\n");
      }
      word->done = FALSE;
    }
  }

  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}
/**
 * Decide whether the recogniser should adapt to a word.
 *
 * @param word  Word result to examine. Reads tess_would_adapt,
 *              tess_accepted and best_choice.
 * @param mode  Bit mask; bit positions are given by the local MODES enum.
 *              A value of 0 disables adaption altogether.
 * @return FALSE if mode is 0, if neither enabled acceptance bit is set on
 *         the word, or if any enabled CHECK_* test vetoes the word;
 *         otherwise the accumulated acceptance status.
 *
 * When tessedit_adaption_debug is set, the reason for each decision is
 * reported through tprintf().
 */
BOOL8 Tesseract::word_adaptable(  // should we adapt?
                                WERD_RES *word,
                                uinT16 mode) {
  if (tessedit_adaption_debug) {
    // The string argument was already guarded against a NULL best_choice;
    // rating and certainty must be guarded the same way, otherwise the
    // debug print itself dereferences NULL.
    WERD_CHOICE *choice = word->best_choice;
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            choice == NULL ? "" : choice->unichar_string().string(),
            choice == NULL ? 0.0 : choice->rating(),
            choice == NULL ? 0.0 : choice->certainty());
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  // Bit positions within `mode`.
  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit(ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit(ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {   // If not set then
    return FALSE;  // ignore other checks
  }

  // NOTE(review): the checks below dereference word->best_choice without a
  // NULL test - presumably it is always set once status is true; confirm.
  if (flags.bit(CHECK_DAWGS) &&
      (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
      (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
      (word->best_choice->permuter() != USER_DAWG_PERM) &&
      (word->best_choice->permuter() != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit(CHECK_ONE_ELL_CONFLICT) && one_ell_conflict(word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit(CHECK_SPACES) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

  if (flags.bit(CHECK_AMBIG_WERD) &&
      word->best_choice->dangerous_ambig_found()) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
/**
 * Decide whether the recogniser should adapt to a word.
 *
 * @param word  Word result to examine. Reads tess_would_adapt,
 *              tess_accepted and best_choice.
 * @param mode  Bit mask; bit positions are given by the local MODES enum.
 *              A value of 0 disables adaption altogether.
 * @return FALSE if mode is 0, if neither enabled acceptance bit is set on
 *         the word, if any enabled CHECK_* test vetoes the word, or if the
 *         word contains a character built from fragments while
 *         tessedit_adapt_to_char_fragments is false; otherwise the
 *         accumulated acceptance status.
 *
 * When tessedit_adaption_debug is set, the reason for each decision is
 * reported through tprintf().
 */
BOOL8 Tesseract::word_adaptable(  // should we adapt?
                                WERD_RES *word,
                                uinT16 mode) {
  if (tessedit_adaption_debug) {
    // The string argument was already guarded against a NULL best_choice;
    // rating and certainty must be guarded the same way, otherwise the
    // debug print itself dereferences NULL.
    WERD_CHOICE *choice = word->best_choice;
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            choice == NULL ? "" : choice->unichar_string().string(),
            choice == NULL ? 0.0 : choice->rating(),
            choice == NULL ? 0.0 : choice->certainty());
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  // Bit positions within `mode`.
  enum MODES {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit(ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit(ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {   // If not set then
    return FALSE;  // ignore other checks
  }

  // NOTE(review): the checks below dereference word->best_choice without a
  // NULL test - presumably it is always set once status is true; confirm.
  if (flags.bit(CHECK_DAWGS) &&
      (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
      (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
      (word->best_choice->permuter() != USER_DAWG_PERM) &&
      (word->best_choice->permuter() != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit(CHECK_ONE_ELL_CONFLICT) && one_ell_conflict(word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit(CHECK_SPACES) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

  // Historically: if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
  if (flags.bit(CHECK_AMBIG_WERD) &&
      !getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  // Do not adapt to words that are composed from fragments if
  // tessedit_adapt_to_char_fragments is false.
  if (!tessedit_adapt_to_char_fragments) {
    const char *fragment_lengths = word->best_choice->fragment_lengths();
    if (fragment_lengths != NULL && *fragment_lengths != '\0') {
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (fragment_lengths[i] > 1) {
          if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
          // Found a character composed from fragments. Uses FALSE, like
          // every other exit of this BOOL8 function.
          return FALSE;
        }
      }
    }
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
/************************************************************************* * make_reject_map() * * Sets the done flag to indicate whether the resylt is acceptable. * * Sets a reject map for the word. *************************************************************************/ void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) { int i; int offset; flip_0O(word); check_debug_pt(word, -1); // For trap only set_done(word, pass); // Set acceptance word->reject_map.initialise(word->best_choice->unichar_lengths().length()); reject_blanks(word); /* 0: Rays original heuristic - the baseline */ if (tessedit_reject_mode == 0) { if (!word->done) reject_poor_matches(word); } else if (tessedit_reject_mode == 5) { /* 5: Reject I/1/l from words where there is no strong contextual confirmation; the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); and the whole of any words which are very small */ if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { word->reject_map.rej_word_small_xht(); } else { one_ell_conflict(word, TRUE); /* Originally the code here just used the done flag. Now I have duplicated and unpacked the conditions for setting the done flag so that each mechanism can be turned on or off independently. This works WITHOUT affecting the done flag setting. 
*/ if (rej_use_tess_accepted && !word->tess_accepted) word->reject_map.rej_word_not_tess_accepted (); if (rej_use_tess_blanks && (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) word->reject_map.rej_word_contains_blanks (); WERD_CHOICE* best_choice = word->best_choice; if (rej_use_good_perm) { if ((best_choice->permuter() == SYSTEM_DAWG_PERM || best_choice->permuter() == FREQ_DAWG_PERM || best_choice->permuter() == USER_DAWG_PERM) && (!rej_use_sensible_wd || acceptable_word_string(*word->uch_set, best_choice->unichar_string().string(), best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE)) { // PASSED TEST } else if (best_choice->permuter() == NUMBER_PERM) { if (rej_alphas_in_number_perm) { for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; offset += best_choice->unichar_lengths()[i++]) { if (word->reject_map[i].accepted() && word->uch_set->get_isalpha( best_choice->unichar_string().string() + offset, best_choice->unichar_lengths()[i])) word->reject_map[i].setrej_bad_permuter(); // rej alpha } } } else { word->reject_map.rej_word_bad_permuter(); } } /* Ambig word rejection was here once !!*/ } } else { tprintf("BAD tessedit_reject_mode\n"); err_exit(); } if (tessedit_image_border > -1) reject_edge_blobs(word); check_debug_pt (word, 10); if (tessedit_rejection_debug) { tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); } flip_hyphens(word); check_debug_pt(word, 20); }