PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) { PAGE_RES_IT pr_it(page_res); C_BLOB_LIST new_blobs; // list of gathered blobs C_BLOB_IT new_blob_it = &new_blobs; // iterator for (WERD_RES* word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { WERD* word = word_res->word; if (word->bounding_box().overlap(selection_box)) { C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); if (blob->bounding_box().overlap(selection_box)) { new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob)); } } if (!new_blobs.empty()) { WERD* pseudo_word = new WERD(&new_blobs, 1, NULL); word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word); PAGE_RES_IT* it = new PAGE_RES_IT(page_res); while (it->word() != word_res && it->word() != NULL) it->forward(); ASSERT_HOST(it->word() == word_res); return it; } } } return NULL; }
// page_res is non-const because the iterator doesn't know if you are going // to change the items it points to! Really a const here though. void Tesseract::blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box) { PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box); if (it != NULL) { WERD_RES* word_res = it->word(); word_res->x_height = it->row()->row->x_height(); word_res->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, classify_bln_numeric_mode, textord_use_cjk_fp_model, poly_allow_detailed_fx, it->row()->row, it->block()->block); TWERD* bln_word = word_res->chopped_word; TBLOB* bln_blob = bln_word->blobs[0]; INT_FX_RESULT_STRUCT fx_info; GenericVector<INT_FEATURE_STRUCT> bl_features; GenericVector<INT_FEATURE_STRUCT> cn_features; Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features, &cn_features, &fx_info, NULL); // Display baseline features. ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0); ClearFeatureSpaceWindow(baseline, bl_win); for (int f = 0; f < bl_features.size(); ++f) RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN); bl_win->Update(); // Display cn features. ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0); ClearFeatureSpaceWindow(character, cn_win); for (int f = 0; f < cn_features.size(); ++f) RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN); cn_win->Update(); it->DeleteCurrentWord(); delete it; } }
void Tesseract::output_pass( //Tess output pass //send to api PAGE_RES_IT &page_res_it, const TBOX *target_word_box) { BLOCK_RES *block_of_last_word; BOOL8 force_eol; //During output BLOCK *nextblock; //block of next word WERD *nextword; //next word page_res_it.restart_page(); block_of_last_word = NULL; while (page_res_it.word() != NULL) { check_debug_pt(page_res_it.word(), 120); if (target_word_box) { TBOX current_word_box = page_res_it.word()->word->bounding_box(); FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2, (current_word_box.bottom() + current_word_box.top()) / 2); if (!target_word_box->contains(center_pt)) { page_res_it.forward(); continue; } } if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) { block_of_last_word = page_res_it.block(); } force_eol = (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) || (page_res_it.next_word() == NULL); if (page_res_it.next_word() != NULL) nextword = page_res_it.next_word()->word; else nextword = NULL; if (page_res_it.next_block() != NULL) nextblock = page_res_it.next_block()->block; else nextblock = NULL; //regardless of tilde crunching write_results(page_res_it, determine_newline_type(page_res_it.word()->word, page_res_it.block()->block, nextword, nextblock), force_eol); page_res_it.forward(); } }
/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); const UNICHARSET &uchset = *word->uch_set; int i; BOOL8 need_reject = FALSE; UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!stats_.tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)) { stats_.last_char_was_tilde = false; } need_reject = TRUE; } if ((need_reject && !stats_.last_char_was_tilde) || (force_eol && stats_.write_results_empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ stats_.last_char_was_tilde = TRUE; stats_.tilde_crunch_written = true; stats_.last_char_was_newline = false; stats_.write_results_empty_block = false; } if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) { stats_.tilde_crunch_written = false; stats_.last_char_was_newline = true; stats_.last_char_was_tilde = false; } if (force_eol) stats_.write_results_empty_block = true; return; } /* NORMAL PROCESSING of non tilde crunched words */ stats_.tilde_crunch_written = false; if (newline_type) stats_.last_char_was_newline = true; else stats_.last_char_was_newline = false; stats_.write_results_empty_block = force_eol; // about to write a real word if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->MergeAdjacentBlobs(0); } if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) stats_.last_char_was_tilde = false; else { if (word->reject_map.length() > 0) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) stats_.last_char_was_tilde = true; else stats_.last_char_was_tilde = false; } else if (word->word->space() > 0) stats_.last_char_was_tilde = false; /* else it is unchanged as there are no output chars */ } ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); set_unlv_suspects(word); check_debug_pt(word, 120); if (tessedit_rejection_debug) { tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if (word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } } }
/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/ void Tesseract::write_results(PAGE_RES_IT &page_res_it, char newline_type, // type of newline BOOL8 force_eol) { // override tilde crunch? WERD_RES *word = page_res_it.word(); const UNICHARSET &uchset = *word->uch_set; STRING repetition_code; const STRING *wordstr; STRING wordstr_lengths; int i; char unrecognised = STRING (unrecognised_char)[0]; char ep_chars[32]; //Only for unlv_tilde_crunch int ep_chars_index = 0; char txt_chs[32]; //Only for unlv_tilde_crunch char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; BOOL8 need_reject = FALSE; UNICHAR_ID space = uchset.unichar_to_id(" "); if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->length() == 0) && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!stats_.tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)))) { if (!word->word->flag (W_BOL) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)) { // Write a space to separate from preceeding good text. txt_chs[txt_index] = ' '; map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = ' '; stats_.last_char_was_tilde = false; } need_reject = TRUE; } if ((need_reject && !stats_.last_char_was_tilde) || (force_eol && stats_.write_results_empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ stats_.last_char_was_tilde = TRUE; txt_chs[txt_index] = unrecognised; if (tessedit_zero_rejection || (suspect_level == 0)) { map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = unrecognised; } else { map_chs[txt_index++] = '0'; /* The ep_choice string is a faked reject to allow newdiff to sync the .etx with the .txt and .map files. */ ep_chars[ep_chars_index++] = CTRL_INSET; // escape code //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; //type ep_chars[ep_chars_index++] = 2; //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; } stats_.tilde_crunch_written = true; stats_.last_char_was_newline = false; stats_.write_results_empty_block = false; } if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) { /* Add a new line output */ txt_chs[txt_index] = '\n'; map_chs[txt_index++] = '\n'; //end line ep_chars[ep_chars_index++] = newline_type; //Cos of the real newline stats_.tilde_crunch_written = false; stats_.last_char_was_newline = true; stats_.last_char_was_tilde = false; } txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; ep_chars[ep_chars_index] = '\0'; // terminate string word->ep_choice = new WERD_CHOICE(ep_chars, uchset); if (force_eol) stats_.write_results_empty_block = true; return; } /* NORMAL PROCESSING of non tilde crunched words */ stats_.tilde_crunch_written = false; if (newline_type) stats_.last_char_was_newline = true; else stats_.last_char_was_newline = false; stats_.write_results_empty_block = force_eol; // about to write a real word if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->best_choice->remove_unichar_id(0); if (word->best_choice->blob_choices() != NULL) { BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices()); if (!blob_choices_it.empty()) delete blob_choices_it.extract(); } word->reject_map.remove_pos (0); word->box_word->DeleteBox(0); } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) stats_.last_char_was_tilde = false; else { if (word->reject_map.length () > 0) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) stats_.last_char_was_tilde = true; else stats_.last_char_was_tilde = false; } else if (word->word->space () > 0) stats_.last_char_was_tilde = false; /* else it is unchanged as there are no output chars */ } ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); set_unlv_suspects(word); check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", word->best_choice->debug_string().string(), dict_word(*(word->best_choice))); } if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; wordstr_lengths = "\001\001\001\001"; repetition_code += uchset.id_to_unichar(get_rep_char(word)); wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word))); wordstr = &repetition_code; } else { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if (word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } } }