/** * @name fix_fuzzy_spaces() * Walk over the page finding sequences of words joined by fuzzy spaces. Extract * them as a sublist, process the sublist to find the optimal arrangement of * spaces then replace the sublist in the ROW_RES. * * @param monitor progress monitor * @param word_count count of words in doc * @param[out] page_res */ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res) { BLOCK_RES_IT block_res_it; ROW_RES_IT row_res_it; WERD_RES_IT word_res_it_from; WERD_RES_IT word_res_it_to; WERD_RES *word_res; WERD_RES_LIST fuzzy_space_words; inT16 new_length; BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds inT32 word_index; // current word block_res_it.set_to_list(&page_res->block_res_list); word_index = 0; for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) { row_res_it.set_to_list(&block_res_it.data()->row_res_list); for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) { word_res_it_from.set_to_list(&row_res_it.data()->word_res_list); while (!word_res_it_from.at_last()) { word_res = word_res_it_from.data(); while (!word_res_it_from.at_last() && !(word_res->combination || word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) || word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) { fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); word_res = word_res_it_from.forward(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) return; } } if (!word_res_it_from.at_last()) { word_res_it_to = word_res_it_from; prevent_null_wd_fixsp = word_res->word->cblob_list()->empty(); if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); word_res_it_to.forward(); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; if (monitor->deadline_exceeded() || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) return; } while (!word_res_it_to.at_last () && (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) || word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) { if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); if (word_res->word->cblob_list()->empty()) prevent_null_wd_fixsp = TRUE; word_res = word_res_it_to.forward(); } if (check_debug_pt(word_res, 60)) debug_fix_space_level.set_value(10); if (word_res->word->cblob_list()->empty()) prevent_null_wd_fixsp = TRUE; if (prevent_null_wd_fixsp) { word_res_it_from = word_res_it_to; } else { fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to); fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row, block_res_it.data()->block); new_length = fuzzy_space_words.length(); word_res_it_from.add_list_before(&fuzzy_space_words); for (; !word_res_it_from.at_last() && new_length > 0; new_length--) { word_res_it_from.forward(); } } if (test_pt) debug_fix_space_level.set_value(0); } fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block); // Last word in row } } } }
void fix_fuzzy_spaces( //find fuzzy words volatile ETEXT_DESC *monitor, //progress monitor inT32 word_count, //count of words in doc PAGE_RES *page_res) { BLOCK_RES_IT block_res_it; //iterators ROW_RES_IT row_res_it; WERD_RES_IT word_res_it_from; WERD_RES_IT word_res_it_to; WERD_RES *word_res; WERD_RES_LIST fuzzy_space_words; inT16 new_length; BOOL8 prevent_null_wd_fixsp; //DONT process blobless wds inT32 word_index; //current word block_res_it.set_to_list (&page_res->block_res_list); word_index = 0; for (block_res_it.mark_cycle_pt (); !block_res_it.cycled_list (); block_res_it.forward ()) { row_res_it.set_to_list (&block_res_it.data ()->row_res_list); for (row_res_it.mark_cycle_pt (); !row_res_it.cycled_list (); row_res_it.forward ()) { word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list); while (!word_res_it_from.at_last ()) { word_res = word_res_it_from.data (); while (!word_res_it_from.at_last () && !(word_res->combination || word_res_it_from.data_relative (1)-> word->flag (W_FUZZY_NON) || word_res_it_from.data_relative (1)-> word->flag (W_FUZZY_SP))) { fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row); word_res = word_res_it_from.forward (); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; } } if (!word_res_it_from.at_last ()) { word_res_it_to = word_res_it_from; prevent_null_wd_fixsp = word_res->word->gblob_list ()->empty (); if (check_debug_pt (word_res, 60)) debug_fix_space_level.set_value (10); word_res_it_to.forward (); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 90 + 5 * word_index / word_count; } while (!word_res_it_to.at_last () && (word_res_it_to.data_relative (1)-> word->flag (W_FUZZY_NON) || word_res_it_to.data_relative (1)-> word->flag (W_FUZZY_SP))) { if (check_debug_pt (word_res, 60)) debug_fix_space_level.set_value (10); if (word_res->word->gblob_list ()->empty ()) prevent_null_wd_fixsp = TRUE; word_res = word_res_it_to.forward (); } if (check_debug_pt (word_res, 60)) debug_fix_space_level.set_value (10); if (word_res->word->gblob_list ()->empty ()) prevent_null_wd_fixsp = TRUE; if (prevent_null_wd_fixsp) word_res_it_from = word_res_it_to; else { fuzzy_space_words.assign_to_sublist (&word_res_it_from, &word_res_it_to); fix_fuzzy_space_list (fuzzy_space_words, row_res_it.data ()->row); new_length = fuzzy_space_words.length (); word_res_it_from.add_list_before (&fuzzy_space_words); for (; (!word_res_it_from.at_last () && (new_length > 0)); new_length--) { word_res_it_from.forward (); } } if (test_pt) debug_fix_space_level.set_value (0); } fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row); //Last word in row } } } }