/**********************************************************************
 * recog_word
 *
 * Recognise a single word and post-process the result held in *word:
 *  - optionally skip words that carry no truth (wordrec_skip_no_truth_words),
 *  - run recog_word_recursive() and build the box word,
 *  - check that best_choice, box_word and the segmentation states agree,
 *  - optionally replace the permuter with the one reported by dict_word(),
 *  - set word->tess_failed, rejecting the whole word when the result is
 *    missing, empty or consists only of spaces.
 **********************************************************************/
void Tesseract::recog_word(WERD_RES *word) {
  if (wordrec_skip_no_truth_words) {
    const bool no_truth =
        word->blamer_bundle == NULL ||
        word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH;
    if (no_truth) {
      if (classify_debug_level) {
        tprintf("No truth for word - skipping\n");
      }
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // Report the mismatch in full before the assertion below fires.
  if (!(word->best_choice->length() == word->box_word->length())) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
            "Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(),
            word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The ratings matrix size must match the sum of all the segmentation
  // states.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // Override the permuter type if a straight dictionary check disagrees.
    const uinT8 initial_perm = word->best_choice->permuter();
    const bool initial_is_dawg = (initial_perm == SYSTEM_DAWG_PERM) ||
                                 (initial_perm == FREQ_DAWG_PERM) ||
                                 (initial_perm == USER_DAWG_PERM);
    if (!initial_is_dawg) {
      const uinT8 dict_perm = dict_word(*word->best_choice);
      const bool dict_is_dawg = (dict_perm == SYSTEM_DAWG_PERM) ||
                                (dict_perm == FREQ_DAWG_PERM) ||
                                (dict_perm == USER_DAWG_PERM);
      // Only adopt the dictionary permuter for words containing an alpha.
      if (dict_is_dawg &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              initial_perm, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
  bool failed =
      word->best_choice == NULL || word->best_choice->length() == 0;
  if (!failed) {
    // A result made up solely of spaces counts as a failure as well.
    const int leading_spaces = static_cast<int>(
        strspn(word->best_choice->unichar_string().string(), " "));
    failed = (leading_spaces == word->best_choice->length());
  }

  if (failed) {
    word->tess_failed = true;
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  } else {
    word->tess_failed = false;
  }
}
/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); const UNICHARSET &uchset = *word->uch_set; int i; BOOL8 need_reject = FALSE; UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!stats_.tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)) { stats_.last_char_was_tilde = false; } need_reject = TRUE; } if ((need_reject && !stats_.last_char_was_tilde) || (force_eol && stats_.write_results_empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ stats_.last_char_was_tilde = TRUE; stats_.tilde_crunch_written = true; stats_.last_char_was_newline = false; stats_.write_results_empty_block = false; } if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) { stats_.tilde_crunch_written = false; 
stats_.last_char_was_newline = true; stats_.last_char_was_tilde = false; } if (force_eol) stats_.write_results_empty_block = true; return; } /* NORMAL PROCESSING of non tilde crunched words */ stats_.tilde_crunch_written = false; if (newline_type) stats_.last_char_was_newline = true; else stats_.last_char_was_newline = false; stats_.write_results_empty_block = force_eol; // about to write a real word if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->MergeAdjacentBlobs(0); } if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) stats_.last_char_was_tilde = false; else { if (word->reject_map.length() > 0) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) stats_.last_char_was_tilde = true; else stats_.last_char_was_tilde = false; } else if (word->word->space() > 0) stats_.last_char_was_tilde = false; /* else it is unchanged as there are no output chars */ } ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); set_unlv_suspects(word); check_debug_pt(word, 120); if (tessedit_rejection_debug) { tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if (word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) 
word->reject_map[i].setrej_minimal_rej_accept(); } } } }
/**********************************************************************
 * recog_word
 *
 * Recognise one word. A word without blobs yields empty choices and a
 * poly_copy() of the input; any other word is handed to
 * recog_word_recursive(). The result is checked against the number of
 * output blobs and blob choices, the reject blobs are copied into outword,
 * and the permuter is optionally replaced by the one that dict_word()
 * reports.
 *
 * word:         word to recognise
 * denorm:       de-normaliser
 * matcher:      matcher function
 * tester:       tester function
 * trainer:      trainer function
 * testing:      true if answer driven
 * raw_choice:   raw result (output)
 * blob_choices: list of blob lists
 * outword:      bln word output
 *
 * Returns the word choice.
 **********************************************************************/
WERD_CHOICE *recog_word(WERD *word,
                        DENORM *denorm,
                        POLY_MATCHER matcher,
                        POLY_TESTER tester,
                        POLY_TESTER trainer,
                        BOOL8 testing,
                        WERD_CHOICE *&raw_choice,
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword) {
  WERD_CHOICE *word_choice = NULL;

  if (!word->blob_list()->empty()) {
    word_choice = recog_word_recursive(word, denorm, matcher, tester, trainer,
                                       testing, raw_choice, blob_choices,
                                       outword);
  } else {
    // Nothing to recognise: hand back empty choices and a copy of the word.
    char empty_lengths[] = {0};
    word_choice = new WERD_CHOICE("", empty_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE("", empty_lengths,
                                 10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy(denorm->row()->x_height());
  }

  // Report any size mismatch in full before the assertions below fire.
  const bool sizes_agree =
      (word_choice->lengths().length() == outword->blob_list()->length()) &&
      (word_choice->lengths().length() == blob_choices->length());
  if (!sizes_agree) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
            word_choice->string().string(),
            word_choice->lengths().length(),
            outword->blob_list()->length(),
            blob_choices->length());
  }
  ASSERT_HOST (word_choice->lengths ().length () == outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  /* Copy any reject blobs into the outword */
  outword->rej_blob_list()->deep_copy(word->rej_blob_list());

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    const uinT8 initial_perm = word_choice->permuter();
    const bool initial_is_dawg = (initial_perm == SYSTEM_DAWG_PERM) ||
                                 (initial_perm == FREQ_DAWG_PERM) ||
                                 (initial_perm == USER_DAWG_PERM);
    if (!initial_is_dawg) {
      const uinT8 dict_perm = dict_word(word_choice->string().string());
      const bool dict_is_dawg = (dict_perm == SYSTEM_DAWG_PERM) ||
                                (dict_perm == FREQ_DAWG_PERM) ||
                                (dict_perm == USER_DAWG_PERM);
      // Only adopt the dictionary permuter for words containing an alpha.
      if (dict_is_dawg &&
          alpha_count(word_choice->string().string(),
                      word_choice->lengths().string()) > 0) {
        word_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              initial_perm, word_choice->permuter());
    }
  }

  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
int64_t decompress(const char* in_filename, const char* out_filename, uint8_t flags) { struct bitio *bd = bstdin; struct dictionary *d = NULL; struct utimbuf *t = NULL; FILE* fout = stdout; char *out_file = NULL; uint8_t bits, initial_bits, meta_type, meta_size; uint16_t c; uint32_t bitMask, cur, first_record, len, next_record, dict_size = 0, written = 0, write_count = 0; uint64_t filesize = 0; char *word; int first = 1, md5c_size = 0, md5d_size = 0; void *meta_data, *md5c = NULL, *md5d = NULL; EVP_MD_CTX *md_ctx = NULL; if (in_filename != NULL) { bd = bitio_open(in_filename, 'r'); if (bd == NULL) goto error; } //read metadata while ((meta_data = meta_read(bd, &meta_type, &meta_size)) != META_END) { LOG("META_TYPE: %d", meta_type); switch (meta_type) { case META_DICT_SIZE: dict_size = *(uint32_t*)meta_data; PRINT(1, "Dictionary Size:\t%d\n", dict_size); break; case META_NAME: PRINT(1, "Original file name:\t%s\n", (char*)meta_data); if (flags & DEC_ORIG_FILENAME) { out_file = malloc(meta_size); if (out_file == NULL) goto error; memcpy((void*)out_file, meta_data, meta_size); out_filename = out_file; } break; case META_MD5: md5c = malloc(meta_size); memcpy(md5c, meta_data, meta_size); md5c_size = meta_size; word = sprinth(md5c, md5c_size); PRINT(1, "Original md5sum:\t%s\n", word); free(word); // initialize md context OpenSSL_add_all_digests(); md_ctx = malloc(sizeof(EVP_MD_CTX)); EVP_MD_CTX_init(md_ctx); EVP_DigestInit(md_ctx, EVP_get_digestbyname("md5")); md5d_size = EVP_MD_CTX_size(md_ctx); md5d = malloc(md5d_size); break; case META_TIMESTAMP: t = malloc(sizeof(*t)); t->actime = *((time_t*)meta_data); // access time t->modtime = *((time_t*)meta_data); // modification time break; default: // META_ERROR LOG("Unknown metadata"); errno = EINVAL; goto error; } free(meta_data); } if ((flags & DEC_ORIG_FILENAME) && out_file == NULL) // if i have DEC_ORIG_FILENAME setted but no info in metadata i use stdin as outfile out_filename = "stdin"; if (out_filename != NULL) { fout 
= fopen(out_filename, "w"); if (fout == NULL) goto error; } if (out_filename != NULL && in_filename != NULL && strcmp(in_filename, out_filename) == 0) { errno = EINVAL; goto error; } if (dict_size == 0) goto error; d = dict_new(dict_size, 0, dict_size, NUM_SYMBOLS); if (d == NULL) goto error; first_record = dict_init(d); next_record = first_record; initial_bits = 0; bitMask = 1; while (bitMask < next_record) { bitMask <<= 1; initial_bits++; } bits = initial_bits; for (;;) { // put in cur the index of the fetched word in the dictionary cur = fetch(bd, bits); if (cur == ROOT_NODE) goto error; if (cur == EOF_SYMBOL) break; c = dict_first_symbol(d, cur); if (c == EOF_SYMBOL) goto error; if (!first) { // complete previous record with index of new record // ROOT_NODE as current node value means 'don't change it'. dict_fill(d, next_record, ROOT_NODE, (uint8_t) c, 0); next_record++; if ((next_record+1) & bitMask) { bitMask <<= 1; bits++; } } else first = 0; // get the word in the dictionary at index cur. 
word = dict_word(d, cur, &len); if (word == NULL) goto error; written = fwrite(word, 1, len, fout); if (written < len) goto error; else { // md5 computation and visual feedback if (md5c != NULL) // compute md5 of decompressed EVP_DigestUpdate(md_ctx, word, len); write_count += written; if (write_count >= COUNT_THRESHOLD) { filesize += write_count; write_count = 0; PRINT(1, "."); } } if (next_record + 1 == dict_size) { next_record = first_record; bits = initial_bits; bitMask = 1 << bits; first = 1; // set first iteration to be the next } // add a new record dict_fill(d, next_record, cur, 0, 0); // symbol will be filled at the beginning of next iteration } filesize += write_count; if (md5c != NULL) { EVP_DigestFinal_ex(md_ctx, md5d, (unsigned int*)&md5d_size); if (md5c_size == md5d_size && memcmp(md5c, md5d, md5c_size) == 0) PRINT(1, "\nmd5sum Check:\t\tOK"); else { PRINT(1, "\nmd5sum Check:\t\tFailed"); goto error; } } PRINT(1, "\nDecompression Finished\n\n"); fclose(fout); if (out_file != NULL && t != NULL) if (utime(out_filename, t) < 0) { // set modification time PRINT(1, "Error while changing last modification time"); } free(out_file); free(t); dict_delete(d); bitio_flush(bd); if (bd != bstdin) bitio_close(bd); return filesize; error: PRINT(1, "\n"); if (out_filename != NULL) unlink(out_filename); free(out_file); free(t); dict_delete(d); bitio_flush(bd); if (bd != bstdin) bitio_close(bd); if (fout != NULL) fclose(fout); return -1; }
/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); const UNICHARSET &uchset = *word->uch_set; STRING repetition_code; const STRING *wordstr; STRING wordstr_lengths; int i; char unrecognised = STRING (unrecognised_char)[0]; char ep_chars[32]; //Only for unlv_tilde_crunch int ep_chars_index = 0; char txt_chs[32]; //Only for unlv_tilde_crunch char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; BOOL8 need_reject = FALSE; UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!stats_.tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)))) { if (!word->word->flag (W_BOL) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)) { // Write a space to separate from preceeding good text. 
txt_chs[txt_index] = ' '; map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = ' '; stats_.last_char_was_tilde = false; } need_reject = TRUE; } if ((need_reject && !stats_.last_char_was_tilde) || (force_eol && stats_.write_results_empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ stats_.last_char_was_tilde = TRUE; txt_chs[txt_index] = unrecognised; if (tessedit_zero_rejection || (suspect_level == 0)) { map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = unrecognised; } else { map_chs[txt_index++] = '0'; /* The ep_choice string is a faked reject to allow newdiff to sync the .etx with the .txt and .map files. */ ep_chars[ep_chars_index++] = CTRL_INSET; // escape code //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; //type ep_chars[ep_chars_index++] = 2; //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; } stats_.tilde_crunch_written = true; stats_.last_char_was_newline = false; stats_.write_results_empty_block = false; } if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) { /* Add a new line output */ txt_chs[txt_index] = '\n'; map_chs[txt_index++] = '\n'; //end line ep_chars[ep_chars_index++] = newline_type; //Cos of the real newline stats_.tilde_crunch_written = false; stats_.last_char_was_newline = true; stats_.last_char_was_tilde = false; } txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; ep_chars[ep_chars_index] = '\0'; // terminate string word->ep_choice = new WERD_CHOICE(ep_chars, uchset); if (force_eol) stats_.write_results_empty_block = true; return; } /* NORMAL PROCESSING of non tilde crunched words */ stats_.tilde_crunch_written = false; if (newline_type) stats_.last_char_was_newline = true; else stats_.last_char_was_newline = false; stats_.write_results_empty_block = force_eol; // about to write a real word if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) 
&& !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->best_choice->remove_unichar_id(0); if (word->best_choice->blob_choices() != NULL) { BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); if (!blob_choices_it.empty()) delete blob_choices_it.extract(); } word->reject_map.remove_pos (0); word->box_word->DeleteBox(0); } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) stats_.last_char_was_tilde = false; else { if (word->reject_map.length () > 0) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) stats_.last_char_was_tilde = true; else stats_.last_char_was_tilde = false; } else if (word->word->space () > 0) stats_.last_char_was_tilde = false; /* else it is unchanged as there are no output chars */ } ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); set_unlv_suspects(word); check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; wordstr_lengths = "\001\001\001\001"; repetition_code += uchset.id_to_unichar(get_rep_char(word)); wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word))); wordstr = &repetition_code; } else { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if (word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if ((word->best_choice->unichar_id(i) != space) && 
word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } } }
/************************************************************************* * make_reject_map() * * Sets the done flag to indicate whether the resylt is acceptable. * * Sets a reject map for the word. *************************************************************************/ void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) { int i; int offset; flip_0O(word); check_debug_pt(word, -1); // For trap only set_done(word, pass); // Set acceptance word->reject_map.initialise(word->best_choice->unichar_lengths().length()); reject_blanks(word); /* 0: Rays original heuristic - the baseline */ if (tessedit_reject_mode == 0) { if (!word->done) reject_poor_matches(word); } else if (tessedit_reject_mode == 5) { /* 5: Reject I/1/l from words where there is no strong contextual confirmation; the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls); and the whole of any words which are very small */ if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) { word->reject_map.rej_word_small_xht(); } else { one_ell_conflict(word, TRUE); /* Originally the code here just used the done flag. Now I have duplicated and unpacked the conditions for setting the done flag so that each mechanism can be turned on or off independently. This works WITHOUT affecting the done flag setting. 
*/ if (rej_use_tess_accepted && !word->tess_accepted) word->reject_map.rej_word_not_tess_accepted (); if (rej_use_tess_blanks && (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) word->reject_map.rej_word_contains_blanks (); WERD_CHOICE* best_choice = word->best_choice; if (rej_use_good_perm) { if ((best_choice->permuter() == SYSTEM_DAWG_PERM || best_choice->permuter() == FREQ_DAWG_PERM || best_choice->permuter() == USER_DAWG_PERM) && (!rej_use_sensible_wd || acceptable_word_string(*word->uch_set, best_choice->unichar_string().string(), best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE)) { // PASSED TEST } else if (best_choice->permuter() == NUMBER_PERM) { if (rej_alphas_in_number_perm) { for (i = 0, offset = 0; best_choice->unichar_string()[offset] != '\0'; offset += best_choice->unichar_lengths()[i++]) { if (word->reject_map[i].accepted() && word->uch_set->get_isalpha( best_choice->unichar_string().string() + offset, best_choice->unichar_lengths()[i])) word->reject_map[i].setrej_bad_permuter(); // rej alpha } } } else { word->reject_map.rej_word_bad_permuter(); } } /* Ambig word rejection was here once !!*/ } } else { tprintf("BAD tessedit_reject_mode\n"); err_exit(); } if (tessedit_image_border > -1) reject_edge_blobs(word); check_debug_pt (word, 10); if (tessedit_rejection_debug) { tprintf("Permuter Type = %d\n", word->best_choice->permuter ()); tprintf("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf("Dict word: %d\n", dict_word(*(word->best_choice))); } flip_hyphens(word); check_debug_pt(word, 20); }
/**********************************************************************
 * one_ell_conflict()
 *
 * Identify words where there is a potential I/l/1 error, using a bundle
 * of contextual heuristics.
 *
 * word_res:   word to examine; its best choice string is temporarily
 *             edited while alternatives are looked up in the dictionary
 * update_map: when TRUE the affected positions are marked in
 *             word_res->reject_map
 *
 * Returns TRUE when a conflict is found, FALSE otherwise.
 **********************************************************************/
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
  const char *word = word_res->best_choice->unichar_string().string();
  const char *lengths = word_res->best_choice->unichar_lengths().string();
  const inT16 word_len = strlen(lengths);

  /* If there are no occurrences of the conflict set characters then the
     word is OK. */
  if (strpbrk(word, conflict_set_I_l_1.string()) == NULL) {
    return FALSE;
  }

  /* There is a conflict if there are NO other (confirmed) alphanumerics
     apart from those in the conflict set. */
  BOOL8 non_conflict_set_char = FALSE;
  inT16 scan_offset = 0;
  for (inT16 idx = 0; (idx < word_len) && !non_conflict_set_char; ++idx) {
    non_conflict_set_char =
        (word_res->uch_set->get_isalpha(word + scan_offset, lengths[idx]) ||
         word_res->uch_set->get_isdigit(word + scan_offset, lengths[idx])) &&
        !STRING (conflict_set_I_l_1).contains (word[scan_offset]);
    scan_offset += lengths[idx];
  }
  if (!non_conflict_set_char) {
    if (update_map) {
      reject_I_1_L(word_res);
    }
    return TRUE;
  }

  /* If the word is accepted by a dawg permuter, and the first alpha
     character is "I" or "l", check to see if the alternative is also a
     dawg word. If it is, then there is a potential error, otherwise the
     word is ok. */
  const BOOL8 dict_perm_type =
      (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
      (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
      (rej_trust_doc_dawg &&
       (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
      (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
  const int dict_word_type = dict_word(*(word_res->best_choice));
  const BOOL8 dict_word_ok =
      (dict_word_type > 0) &&
      (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  inT16 lead_index;   // index of the first alphanumeric character
  inT16 lead_offset;  // its offset within the string

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
      (rej_1Il_trust_permuter_type && dict_perm_type) ||
      (dict_perm_type && dict_word_ok)) {
    lead_index = first_alphanum_index(word, lengths);
    lead_offset = first_alphanum_offset(word, lengths);
    if (lengths[lead_index] == 1) {
      const char seen = word[lead_offset];
      if (seen == 'I' || seen == 'l') {
        // Try the other letter, then always put the original back.
        const char swapped = (seen == 'I') ? 'l' : 'I';
        word_res->best_choice->unichar_string()[lead_offset] = swapped;
        const bool alternative_is_word = safe_dict_word(word_res) > 0;
        word_res->best_choice->unichar_string()[lead_offset] = seen;
        if (!alternative_is_word) {
          return FALSE;
        }
        if (update_map) {
          word_res->reject_map[lead_index].setrej_1Il_conflict();
        }
        return TRUE;
      }
    }
    return FALSE;
  }

  /* NEW 1Il code. The old code relied on permuter types too much. In fact,
     tess will use TOP_CHOICE permute for good things like "palette".
     In this code the string is examined independently to see if it looks
     like a well formed word. */

  /* REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
     dictionary word. When it does, the flipped letter is left in place. */
  lead_index = first_alphanum_index(word, lengths);
  lead_offset = first_alphanum_offset(word, lengths);
  if (lengths[lead_index] == 1) {
    const char seen = word[lead_offset];
    if (seen == 'l' || seen == 'I') {
      const char swapped = (seen == 'l') ? 'I' : 'l';
      word_res->best_choice->unichar_string()[lead_offset] = swapped;
      if (safe_dict_word(word_res) > 0) {
        return FALSE;
      }
      word_res->best_choice->unichar_string()[lead_offset] = seen;
    }
  }

  /* For strings containing digits:
       If there are no alphas OR the numeric permuter liked the word,
         reject any non 1 conflict chs
       Else reject all conflict chs */
  if (word_contains_non_1_digit(word, lengths)) {
    const BOOL8 allow_1s =
        (alpha_count(word, lengths) == 0) ||
        (word_res->best_choice->permuter() == NUMBER_PERM);
    BOOL8 conflict = FALSE;
    inT16 char_idx = 0;
    inT16 byte_pos = 0;
    while (word[byte_pos] != '\0') {
      if ((!allow_1s || (word[byte_pos] != '1')) &&
          STRING (conflict_set_I_l_1).contains (word[byte_pos])) {
        if (update_map) {
          word_res->reject_map[char_idx].setrej_1Il_conflict();
        }
        conflict = TRUE;
      }
      byte_pos += word_res->best_choice->unichar_lengths()[char_idx];
      ++char_idx;
    }
    return conflict;
  }

  /* For anything else. See if it conforms to an acceptable word type. If
     so, treat accordingly. */
  const ACCEPTABLE_WERD_TYPE word_type =
      acceptable_word_string(*word_res->uch_set, word, lengths);
  switch (word_type) {
    case AC_LOWER_CASE:
    case AC_INITIAL_CAP:
      // Only the leading alphanumeric can be in conflict.
      lead_index = first_alphanum_index(word, lengths);
      lead_offset = first_alphanum_offset(word, lengths);
      if (!STRING (conflict_set_I_l_1).contains (word[lead_offset])) {
        return FALSE;
      }
      if (update_map) {
        word_res->reject_map[lead_index].setrej_1Il_conflict();
      }
      return TRUE;

    case AC_UPPER_CASE:
      return FALSE;

    default:
      if (update_map) {
        reject_I_1_L(word_res);
      }
      return TRUE;
  }
}