// Make a text string from the internal data structures. // The input page_res is deleted. char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { if (page_res != NULL) { int total_length = TextLength(page_res); PAGE_RES_IT page_res_it(page_res); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { strcpy(ptr, choice->string().string()); ptr += strlen(ptr); if (word->word->flag(W_EOL)) *ptr++ = '\n'; else *ptr++ = ' '; } } *ptr++ = '\n'; *ptr = '\0'; delete page_res; return result; } return NULL; }
/**********************************************************************
 * run_cube_combiner
 *
 * Walks tesseract's word results and, for each word in a text block whose
 * certainty is below the combiner threshold, runs cube on a trial copy of
 * the word and combines cube's result with the existing tesseract result.
 * Does nothing when page_res or tess_cube_combiner_ is NULL.
 **********************************************************************/
void Tesseract::run_cube_combiner(PAGE_RES *page_res) {
  if (page_res == NULL || tess_cube_combiner_ == NULL)
    return;

  PAGE_RES_IT it(page_res);
  it.restart_page();
  for (; it.word() != NULL; it.forward()) {
    BLOCK* cur_block = it.block()->block;
    // Blocks with a poly_block that is not text are left untouched.
    const bool non_text_block =
        cur_block->poly_block() != NULL && !cur_block->poly_block()->IsText();
    if (non_text_block)
      continue;

    WERD_RES* tess_word = it.word();
    // Tesseract is already confident enough about this word: skip cube.
    int run_thresh = convert_prob_to_tess_certainty(
        cube_cntxt_->Params()->CombinerRunThresh());
    if (tess_word->best_choice->certainty() >= run_thresh)
      continue;

    // Recognize with the same language instance tesseract used for the word.
    Tesseract* word_lang = tess_word->tesseract;

    // Trial WERD_RES that cube classifies into, leaving tess_word intact
    // until the two results are combined.
    WERD_RES trial_word;
    trial_word.InitForRetryRecognition(*tess_word);
    trial_word.SetupForRecognition(word_lang->unicharset, this, BestPix(),
                                   OEM_CUBE_ONLY, NULL, false, false, false,
                                   it.row()->row, it.block()->block);
    CubeObject *cube_result =
        word_lang->cube_recognize_word(it.block()->block, &trial_word);
    if (cube_result != NULL)
      word_lang->cube_combine_word(cube_result, &trial_word, tess_word);
    delete cube_result;
  }
}
/**********************************************************************
 * run_cube_combiner
 *
 * Walks tesseract's word results and, for each word whose certainty is
 * below the combiner threshold, runs cube on a trial copy of the word and
 * combines cube's result with the existing tesseract result.
 * Does nothing when page_res or tess_cube_combiner_ is NULL.
 **********************************************************************/
void Tesseract::run_cube_combiner(PAGE_RES *page_res) {
  if (page_res == NULL || tess_cube_combiner_ == NULL)
    return;

  PAGE_RES_IT it(page_res);
  it.restart_page();
  for (; it.word() != NULL; it.forward()) {
    WERD_RES* tess_word = it.word();
    // Tesseract is already confident enough about this word: skip cube.
    int run_thresh = convert_prob_to_tess_certainty(
        cube_cntxt_->Params()->CombinerRunThresh());
    if (tess_word->best_choice->certainty() >= run_thresh)
      continue;

    // Recognize with the same language instance tesseract used for the word.
    Tesseract* word_lang = tess_word->tesseract;

    // Trial WERD_RES that cube classifies into, leaving tess_word intact
    // until the two results are combined.
    WERD_RES trial_word;
    trial_word.InitForRetryRecognition(*tess_word);
    CubeObject *cube_result =
        word_lang->cube_recognize_word(it.block()->block, &trial_word);
    if (cube_result != NULL)
      word_lang->cube_combine_word(cube_result, &trial_word, tess_word);
    delete cube_result;
  }
}
// Extract the OCR results, costs (penalty points for uncertainty), // and the bounding boxes of the characters. static void extract_result(ELIST_ITERATOR *out, PAGE_RES* page_res) { PAGE_RES_IT page_res_it(page_res); int word_count = 0; while (page_res_it.word() != NULL) { WERD_RES *word = page_res_it.word(); const char *str = word->best_choice->string().string(); const char *len = word->best_choice->lengths().string(); if (word_count) add_space(out); TBOX bln_rect; PBLOB_LIST *blobs = word->outword->blob_list(); PBLOB_IT it(blobs); int n = strlen(len); TBOX** boxes_to_fix = new TBOX*[n]; for (int i = 0; i < n; i++) { PBLOB *blob = it.data(); TBOX current = blob->bounding_box(); bln_rect = bln_rect.bounding_union(current); TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()), str, *len); tc->box = current; boxes_to_fix[i] = &tc->box; out->add_after_then_move(tc); it.forward(); str += *len; len++; } // Find the word bbox before normalization. // Here we can't use the C_BLOB bboxes directly, // since connected letters are not yet cut. TBOX real_rect = word->word->bounding_box(); // Denormalize boxes by transforming the bbox of the whole bln word // into the denorm bbox (`real_rect') of the whole word. double x_stretch = double(real_rect.width()) / bln_rect.width(); double y_stretch = double(real_rect.height()) / bln_rect.height(); for (int j = 0; j < n; j++) { TBOX *box = boxes_to_fix[j]; int x0 = int(real_rect.left() + x_stretch * (box->left() - bln_rect.left()) + 0.5); int x1 = int(real_rect.left() + x_stretch * (box->right() - bln_rect.left()) + 0.5); int y0 = int(real_rect.bottom() + y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5); int y1 = int(real_rect.bottom() + y_stretch * (box->top() - bln_rect.bottom()) + 0.5); *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1)); } delete [] boxes_to_fix; page_res_it.forward(); word_count++; } }
/** * @name process_selected_words() * * Walk the current block list applying the specified word processor function * to each word that overlaps the selection_box. */ void Tesseract::process_selected_words( PAGE_RES* page_res, // blocks to check TBOX & selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it)) { for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL; page_res_it.forward()) { WERD* word = page_res_it.word()->word; if (word->bounding_box().overlap(selection_box)) { if (!(this->*word_processor)(&page_res_it)) return; } } }
// Make a text string from the internal data structures. // The input page_res is deleted. // The text string takes the form of a box file as needed for training. char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res, int left, int bottom) { if (page_res != NULL) { int total_length = TextLength(page_res) * kMaxCharsPerChar; PAGE_RES_IT page_res_it(page_res); char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); ptr += ConvertWordToBoxText(word,page_res_it.row(),left, bottom, ptr); } *ptr = '\0'; delete page_res; return result; } return NULL; }
// Return the maximum length that the output text string might occupy. int TessBaseAPI::TextLength(PAGE_RES* page_res) { PAGE_RES_IT page_res_it(page_res); int total_length = 2; // Iterate over the data structures to extract the recognition result. for (page_res_it.restart_page(); page_res_it.word () != NULL; page_res_it.forward()) { WERD_RES *word = page_res_it.word(); WERD_CHOICE* choice = word->best_choice; if (choice != NULL) { total_length += choice->string().length() + 1; for (int i = 0; i < word->reject_map.length(); ++i) { if (word->reject_map[i].rejected()) ++total_length; } } } return total_length; }
// Make a UNLV-style text string from the internal data structures.
//
// Words whose unlv_crunch_mode is not CR_NONE are represented by at most one
// reject character (kUnrecognized). All other words are written character
// by character: ' ', '~' and '|' become kUnrecognized, rejected characters
// are prefixed with '^', characters found in kUniChs are replaced by the
// entry of kLatinChs at the same index, and anything still above 0xff
// becomes kUnrecognized. Words are separated by spaces, W_EOL words end a
// line, and a final '\n' is appended.
// The input page_res is deleted.
// Returns a buffer allocated with new[], or NULL when page_res is NULL.
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
  bool tilde_crunch_written = false;
  bool last_char_was_newline = true;
  bool last_char_was_tilde = false;

  if (page_res == NULL)
    return NULL;

  int total_length = TextLength(page_res);
  PAGE_RES_IT page_res_it(page_res);
  char* result = new char[total_length];
  char* ptr = result;
  for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    // Process the current word.
    if (word->unlv_crunch_mode != CR_NONE) {
      if (word->unlv_crunch_mode != CR_DELETE &&
          (!tilde_crunch_written ||
           (word->unlv_crunch_mode == CR_KEEP_SPACE &&
            word->word->space() > 0 &&
            !word->word->flag(W_FUZZY_NON) &&
            !word->word->flag(W_FUZZY_SP)))) {
        if (!word->word->flag(W_BOL) &&
            word->word->space() > 0 &&
            !word->word->flag(W_FUZZY_NON) &&
            !word->word->flag(W_FUZZY_SP)) {
          // Write a space to separate from preceding good text.
          *ptr++ = ' ';
          last_char_was_tilde = false;
        }
        if (!last_char_was_tilde) {
          // Write a reject char.
          last_char_was_tilde = true;
          *ptr++ = kUnrecognized;
          tilde_crunch_written = true;
          last_char_was_newline = false;
        }
      }
    } else {
      // NORMAL PROCESSING of non tilde crunched words.
      tilde_crunch_written = false;

      if (last_char_was_tilde &&
          word->word->space() == 0 &&
          (word->best_choice->string()[0] == ' ')) {
        // Prevent adjacent tilde across words - we know that adjacent tildes
        // within words have been removed.
        // The leading character is dropped by shifting the rest of the
        // buffer (terminator included) down by one. Source and destination
        // overlap, so memmove is required: strcpy on overlapping buffers is
        // undefined behaviour.
        // NOTE(review): the buffers are edited in place, bypassing STRING;
        // whether STRING::length() reflects the shortened text is not
        // visible here - confirm.
        char* p = (char *) word->best_choice->string().string();
        memmove(p, p + 1, strlen(p + 1) + 1);  // shuffle up
        p = (char *) word->best_choice->lengths().string();
        memmove(p, p + 1, strlen(p + 1) + 1);  // shuffle up
        word->reject_map.remove_pos(0);
        PBLOB_IT blob_it = word->outword->blob_list();
        delete blob_it.extract();  // get rid of reject blob
      }
      if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
        ensure_rep_chars_are_consistent(word);

      set_unlv_suspects(word);
      const char* wordstr = word->best_choice->string().string();
      if (wordstr[0] != 0) {
        // Separate from the previous word unless this starts a line.
        if (!last_char_was_newline)
          *ptr++ = ' ';
        else
          last_char_was_newline = false;
        // lengths[i] is the number of bytes of character i within wordstr;
        // offset is the byte position of character i.
        int offset = 0;
        const STRING& lengths = word->best_choice->lengths();
        int length = lengths.length();
        for (int i = 0; i < length; offset += lengths[i++]) {
          if (wordstr[offset] == ' ' ||
              wordstr[offset] == '~' ||
              wordstr[offset] == '|') {
            *ptr++ = kUnrecognized;
            last_char_was_tilde = true;
          } else {
            if (word->reject_map[i].rejected())
              *ptr++ = '^';
            UNICHAR ch(wordstr + offset, lengths[i]);
            int uni_ch = ch.first_uni();
            // Substitute via the zero-terminated kUniChs table and the
            // parallel kLatinChs table.
            for (int j = 0; kUniChs[j] != 0; ++j) {
              if (kUniChs[j] == uni_ch) {
                uni_ch = kLatinChs[j];
                break;
              }
            }
            if (uni_ch <= 0xff) {
              *ptr++ = static_cast<char>(uni_ch);
              last_char_was_tilde = false;
            } else {
              // Does not fit in a single byte.
              *ptr++ = kUnrecognized;
              last_char_was_tilde = true;
            }
          }
        }
      }
    }
    if (word->word->flag(W_EOL) && !last_char_was_newline) {
      // Add a new line output.
      *ptr++ = '\n';
      tilde_crunch_written = false;
      last_char_was_newline = true;
      last_char_was_tilde = false;
    }
  }
  *ptr++ = '\n';
  *ptr = '\0';
  delete page_res;
  return result;
}
/// Gather consecutive blobs that match the given box into the best_state /// and corresponding correct_text. /// /// Fights over which box owns which blobs are settled by pre-chopping and /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an appropriate blob for a box. /// /// This means that occasionally, blobs may be incorrectly segmented if the /// chopper fails to find a suitable chop point. bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text); } PAGE_RES_IT page_res_it(page_res); WERD_RES* word_res; for (word_res = page_res_it.word(); word_res != NULL; word_res = page_res_it.forward()) { if (!word_res->box_word->bounding_box().major_overlap(box)) continue; if (applybox_debug > 1) { tprintf("Checking word box:"); word_res->box_word->bounding_box().print(); } int word_len = word_res->box_word->length(); for (int i = 0; i < word_len; ++i) { TBOX char_box = TBOX(); int blob_count = 0; for (blob_count = 0; i + blob_count < word_len; ++blob_count) { TBOX blob_box = word_res->box_word->BlobBox(i + blob_count); if (!blob_box.major_overlap(box)) break; if (word_res->correct_text[i + blob_count].length() > 0) break; // Blob is claimed already. double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) break; // Blob is a better match for next box. 
char_box += blob_box; } if (blob_count > 0) { if (applybox_debug > 1) { tprintf("Index [%d, %d) seem good.\n", i, i + blob_count); } if (!char_box.almost_equal(box, 3) && (box.x_gap(next_box) < -3 || (prev_box != NULL && prev_box->x_gap(box) < -3))) { return false; } // We refine just the box_word, best_state and correct_text here. // The rebuild_word is made in TidyUp. // blob_count blobs are put together to match the box. Merge the // box_word boxes, save the blob_count in the state and the text. word_res->box_word->MergeBoxes(i, i + blob_count); word_res->best_state[i] = blob_count; word_res->correct_text[i] = correct_text; if (applybox_debug > 2) { tprintf("%d Blobs match: blob box:", blob_count); word_res->box_word->BlobBox(i).print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } // Eliminated best_state and correct_text entries for the consumed // blobs. for (int j = 1; j < blob_count; ++j) { word_res->best_state.remove(i + 1); word_res->correct_text.remove(i + 1); } // Assume that no box spans multiple source words, so we are done with // this box. if (applybox_debug > 1) { tprintf("Best state = "); for (int j = 0; j < word_res->best_state.size(); ++j) { tprintf("%d ", word_res->best_state[j]); } tprintf("\n"); tprintf("Correct text = [[ "); for (int j = 0; j < word_res->correct_text.size(); ++j) { tprintf("%s ", word_res->correct_text[j].string()); } tprintf("]]\n"); } return true; } } } if (applybox_debug > 0) { tprintf("FAIL!\n"); } return false; // Failure. }