Esempio n. 1
0
/**
 * @name fix_fuzzy_spaces()
 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
 * them as a sublist, process the sublist to find the optimal arrangement of
 * spaces then replace the sublist in the ROW_RES.
 *
 * @param monitor progress monitor
 * @param word_count count of words in doc
 * @param[out] page_res
 */
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
                                 inT32 word_count,
                                 PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  inT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   // DONT process blobless wds
  inT32 word_index;              // current word

  block_res_it.set_to_list(&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
       block_res_it.forward()) {
    row_res_it.set_to_list(&block_res_it.data()->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
      while (!word_res_it_from.at_last()) {
        word_res = word_res_it_from.data();
        while (!word_res_it_from.at_last() &&
               !(word_res->combination ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                         block_res_it.data()->block);
          word_res = word_res_it_from.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
            return;
          }
        }

        if (!word_res_it_from.at_last()) {
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->cblob_list()->empty();
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          word_res_it_to.forward();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
            if (monitor->deadline_exceeded() ||
                (monitor->cancel != NULL &&
                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
            return;
          }
          while (!word_res_it_to.at_last () &&
                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
            if (check_debug_pt(word_res, 60))
              debug_fix_space_level.set_value(10);
            if (word_res->word->cblob_list()->empty())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward();
          }
          if (check_debug_pt(word_res, 60))
            debug_fix_space_level.set_value(10);
          if (word_res->word->cblob_list()->empty())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp) {
            word_res_it_from = word_res_it_to;
          } else {
            fuzzy_space_words.assign_to_sublist(&word_res_it_from,
                                                &word_res_it_to);
            fix_fuzzy_space_list(fuzzy_space_words,
                                 row_res_it.data()->row,
                                 block_res_it.data()->block);
            new_length = fuzzy_space_words.length();
            word_res_it_from.add_list_before(&fuzzy_space_words);
            for (;
                 !word_res_it_from.at_last() && new_length > 0;
                 new_length--) {
              word_res_it_from.forward();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value(0);
        }
        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                       block_res_it.data()->block);
        // Last word in row
      }
    }
  }
}
void fix_fuzzy_spaces(                               //find fuzzy words
                      volatile ETEXT_DESC *monitor,  //progress monitor
                      inT32 word_count,              //count of words in doc
                      PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;     //iterators
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  inT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
  inT32 word_index;              //current word

  block_res_it.set_to_list (&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt ();
  !block_res_it.cycled_list (); block_res_it.forward ()) {
    row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
    for (row_res_it.mark_cycle_pt ();
    !row_res_it.cycled_list (); row_res_it.forward ()) {
      word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
      while (!word_res_it_from.at_last ()) {
        word_res = word_res_it_from.data ();
        while (!word_res_it_from.at_last () &&
          !(word_res->combination ||
          word_res_it_from.data_relative (1)->
          word->flag (W_FUZZY_NON) ||
          word_res_it_from.data_relative (1)->
        word->flag (W_FUZZY_SP))) {
          fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
          word_res = word_res_it_from.forward ();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
          }
        }

        if (!word_res_it_from.at_last ()) {
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->gblob_list ()->empty ();
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          word_res_it_to.forward ();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
          }
          while (!word_res_it_to.at_last () &&
            (word_res_it_to.data_relative (1)->
            word->flag (W_FUZZY_NON) ||
            word_res_it_to.data_relative (1)->
          word->flag (W_FUZZY_SP))) {
            if (check_debug_pt (word_res, 60))
              debug_fix_space_level.set_value (10);
            if (word_res->word->gblob_list ()->empty ())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward ();
          }
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          if (word_res->word->gblob_list ()->empty ())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp)
            word_res_it_from = word_res_it_to;
          else {
            fuzzy_space_words.assign_to_sublist (&word_res_it_from,
              &word_res_it_to);
            fix_fuzzy_space_list (fuzzy_space_words,
              row_res_it.data ()->row);
            new_length = fuzzy_space_words.length ();
            word_res_it_from.add_list_before (&fuzzy_space_words);
            for (;
              (!word_res_it_from.at_last () &&
            (new_length > 0)); new_length--) {
              word_res_it_from.forward ();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value (0);
        }
        fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
        //Last word in row
      }
    }
  }
}