inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_it(&word_res_list); WERD_RES *word; PBLOB_IT blob_it; inT16 word_length; inT16 score = 0; inT16 i; inT16 offset; const char *chs; float small_limit = bln_x_height * fixsp_small_outlines_size; if (!fixsp_fp_eval) return (eval_word_spacing (word_res_list)); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_length = word->reject_map.length (); chs = word->best_choice->string ().string (); if ((word->done || word->tess_accepted) || (word->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word->best_choice->permuter () == FREQ_DAWG_PERM) || (word->best_choice->permuter () == USER_DAWG_PERM) || (safe_dict_word (chs) > 0)) { blob_it.set_to_list (word->outword->blob_list ()); for (i = 0, offset = 0; i < word_length; offset += word->best_choice->lengths()[i++], blob_it.forward ()) { if ((chs[offset] == ' ') || (blob_noise_score (blob_it.data ()) < small_limit)) score -= 1; //penalise possibly erroneous non-space else if (word->reject_map[i].accepted ()) score++; } } } if (score < 0) score = 0; return score; }
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_it(&word_res_list); WERD_RES *word; inT16 word_length; inT16 score = 0; inT16 i; float small_limit = kBlnXHeight * fixsp_small_outlines_size; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if (word->rebuild_word == NULL) continue; // Can't handle cube words. word_length = word->reject_map.length(); if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || word->best_choice->permuter() == FREQ_DAWG_PERM || word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) { TBLOB* blob = word->rebuild_word->blobs; UNICHAR_ID space = word->uch_set->unichar_to_id(" "); for (i = 0; i < word->best_choice->length() && blob != NULL; ++i, blob = blob->next) { if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) { score -= 1; // penalise possibly erroneous non-space } else if (word->reject_map[i].accepted()) { score++; } } } } if (score < 0) score = 0; return score; }
/************************************************************************* * SUSPECT LEVELS * * 0 - dont reject ANYTHING * 1,2 - partial rejection * 3 - BEST * * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and * tessedit_minimal_rejection. *************************************************************************/ void Tesseract::set_unlv_suspects(WERD_RES * word_res) { int len = word_res->reject_map.length(); const WERD_CHOICE &word = *(word_res->best_choice); const UNICHARSET &uchset = *word.unicharset(); int i; float rating_per_ch; if (suspect_level == 0) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) word_res->reject_map[i].setrej_minimal_rej_accept(); } return; } if (suspect_level >= 3) return; //Use defaults /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } rating_per_ch = word.rating() / word_res->reject_map.length(); if (rating_per_ch >= suspect_rating_per_ch) return; //Dont touch bad ratings if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) { if (word_res->reject_map[i].flag(R_DOC_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); if (word_res->reject_map[i].flag(R_BLOCK_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); if (word_res->reject_map[i].flag(R_ROW_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); } } if (suspect_level == 2) return; if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) { if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || word_res->reject_map[i].flag(R_POSTNN_1IL))) word_res->reject_map[i].setrej_minimal_rej_accept(); if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) word_res->reject_map[i].setrej_minimal_rej_accept(); } } } if (acceptable_word_string(*word_res->uch_set, word.unichar_string().string(), word.unichar_lengths().string()) != AC_UNACCEPTABLE || acceptable_number_string(word.unichar_string().string(), word.unichar_lengths().string())) { if (word_res->reject_map.length() > suspect_short_words) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() || word_res->reject_map[i].flag(R_1IL_CONFLICT) || word_res->reject_map[i].flag(R_POSTNN_1IL) || word_res->reject_map[i].flag(R_MM_REJECT))) { word_res->reject_map[i].setrej_minimal_rej_accept(); } } } } }
/********************************************************************** * one_ell_conflict() * * Identify words where there is a potential I/l/1 error. * - A bundle of contextual heuristics! **********************************************************************/ BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { const char *word; const char *lengths; inT16 word_len; //its length inT16 first_alphanum_index_; inT16 first_alphanum_offset_; inT16 i; inT16 offset; BOOL8 non_conflict_set_char; //non conf set a/n? BOOL8 conflict = FALSE; BOOL8 allow_1s; ACCEPTABLE_WERD_TYPE word_type; BOOL8 dict_perm_type; BOOL8 dict_word_ok; int dict_word_type; word = word_res->best_choice->unichar_string().string (); lengths = word_res->best_choice->unichar_lengths().string(); word_len = strlen (lengths); /* If there are no occurrences of the conflict set characters then the word is OK. */ if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) return FALSE; /* There is a conflict if there are NO other (confirmed) alphanumerics apart from those in the conflict set. */ for (i = 0, offset = 0, non_conflict_set_char = FALSE; (i < word_len) && !non_conflict_set_char; offset += lengths[i++]) non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) || word_res->uch_set->get_isdigit(word + offset, lengths[i])) && !STRING (conflict_set_I_l_1).contains (word[offset]); if (!non_conflict_set_char) { if (update_map) reject_I_1_L(word_res); return TRUE; } /* If the word is accepted by a dawg permuter, and the first alpha character is "I" or "l", check to see if the alternative is also a dawg word. If it is, then there is a potential error otherwise the word is ok. */ dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word_res->best_choice->permuter () == USER_DAWG_PERM) || (rej_trust_doc_dawg && (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || (word_res->best_choice->permuter () == FREQ_DAWG_PERM); dict_word_type = dict_word(*(word_res->best_choice)); dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) || (dict_perm_type && dict_word_ok)) { first_alphanum_index_ = first_alphanum_index (word, lengths); first_alphanum_offset_ = first_alphanum_offset (word, lengths); if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; if (update_map) word_res->reject_map[first_alphanum_index_]. setrej_1Il_conflict(); return TRUE; } else { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; return FALSE; } } if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; if (safe_dict_word(word_res) > 0) { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; if (update_map) word_res->reject_map[first_alphanum_index_]. setrej_1Il_conflict(); return TRUE; } else { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; return FALSE; } } return FALSE; } /* NEW 1Il code. The old code relied on permuter types too much. In fact, tess will use TOP_CHOICE permute for good things like "palette". In this code the string is examined independently to see if it looks like a well formed word. */ /* REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a dictionary word. */ first_alphanum_index_ = first_alphanum_index (word, lengths); first_alphanum_offset_ = first_alphanum_offset (word, lengths); if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') { word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l'; if (safe_dict_word(word_res) > 0) return FALSE; else word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I'; } /* For strings containing digits: If there are no alphas OR the numeric permuter liked the word, reject any non 1 conflict chs Else reject all conflict chs */ if (word_contains_non_1_digit (word, lengths)) { allow_1s = (alpha_count (word, lengths) == 0) || (word_res->best_choice->permuter () == NUMBER_PERM); inT16 offset; conflict = FALSE; for (i = 0, offset = 0; word[offset] != '\0'; offset += word_res->best_choice->unichar_lengths()[i++]) { if ((!allow_1s || (word[offset] != '1')) && STRING (conflict_set_I_l_1).contains (word[offset])) { if (update_map) word_res->reject_map[i].setrej_1Il_conflict (); conflict = TRUE; } } return conflict; } /* For anything else. See if it conforms to an acceptable word type. If so, treat accordingly. */ word_type = acceptable_word_string(*word_res->uch_set, word, lengths); if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { first_alphanum_index_ = first_alphanum_index (word, lengths); first_alphanum_offset_ = first_alphanum_offset (word, lengths); if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) { if (update_map) word_res->reject_map[first_alphanum_index_]. setrej_1Il_conflict (); return TRUE; } else return FALSE; } else if (word_type == AC_UPPER_CASE) { return FALSE; } else { if (update_map) reject_I_1_L(word_res); return TRUE; } }