Exemplo n.º 1
0
// Builds a temporary "pseudo word" from the blobs that overlap selection_box.
// Words are visited in page order; for the first word that has at least one
// overlapping blob, deep copies of those blobs are wrapped in a new WERD and
// inserted into page_res via InsertSimpleCloneWord.
// Returns a heap-allocated iterator positioned on the inserted word (the
// caller is responsible for deleting it), or NULL if no blob overlaps.
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT page_it(page_res);
  C_BLOB_LIST gathered;                // copies of the overlapping blobs
  C_BLOB_IT gathered_it = &gathered;   // append position in gathered

  WERD_RES* current = page_it.word();
  while (current != NULL) {
    WERD* real_word = current->word;
    if (real_word->bounding_box().overlap(selection_box)) {
      // Copy every blob of this word that touches the selection.
      C_BLOB_IT src_it(real_word->cblob_list());
      for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
        C_BLOB* candidate = src_it.data();
        if (!candidate->bounding_box().overlap(selection_box))
          continue;
        gathered_it.add_after_then_move(C_BLOB::deep_copy(candidate));
      }
      if (!gathered.empty()) {
        WERD* pseudo_word = new WERD(&gathered, 1, NULL);
        current = page_it.InsertSimpleCloneWord(*current, pseudo_word);
        // Walk a fresh iterator from the start of the page until it sits on
        // the word that was just inserted.
        PAGE_RES_IT* result = new PAGE_RES_IT(page_res);
        while (result->word() != current && result->word() != NULL) {
          result->forward();
        }
        ASSERT_HOST(result->word() == current);
        return result;
      }
    }
    current = page_it.forward();
  }
  return NULL;
}
Exemplo n.º 2
0
// Displays the integer features of the first blob found under selection_box.
// A pseudo word is built from the selection, set up for recognition, and the
// features extracted from its first chopped blob are drawn in two windows
// ("BL Features" and "CN Features"). The pseudo word and its iterator are
// removed again before returning.
// page_res is non-const because the iterator type cannot promise that the
// items it points to stay unchanged.
void Tesseract::blob_feature_display(PAGE_RES* page_res,
                                     const TBOX& selection_box) {
  PAGE_RES_IT* pseudo_it = make_pseudo_word(page_res, selection_box);
  if (pseudo_it == NULL)
    return;  // Nothing under the selection.

  WERD_RES* pseudo_res = pseudo_it->word();
  pseudo_res->x_height = pseudo_it->row()->row->x_height();
  pseudo_res->SetupForRecognition(unicharset, this, BestPix(),
                                  tessedit_ocr_engine_mode, NULL,
                                  classify_bln_numeric_mode,
                                  textord_use_cjk_fp_model,
                                  poly_allow_detailed_fx,
                                  pseudo_it->row()->row,
                                  pseudo_it->block()->block);
  TBLOB* first_blob = pseudo_res->chopped_word->blobs[0];

  INT_FX_RESULT_STRUCT fx_info;
  GenericVector<INT_FEATURE_STRUCT> bl_features;
  GenericVector<INT_FEATURE_STRUCT> cn_features;
  Classify::ExtractFeatures(*first_blob, classify_nonlinear_norm,
                            &bl_features, &cn_features, &fx_info, NULL);

  // Baseline feature window.
  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
  ClearFeatureSpaceWindow(baseline, bl_win);
  const int num_bl = bl_features.size();
  for (int i = 0; i < num_bl; ++i) {
    RenderIntFeature(bl_win, &bl_features[i], ScrollView::GREEN);
  }
  bl_win->Update();

  // Character-normalized feature window.
  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
  ClearFeatureSpaceWindow(character, cn_win);
  const int num_cn = cn_features.size();
  for (int i = 0; i < num_cn; ++i) {
    RenderIntFeature(cn_win, &cn_features[i], ScrollView::GREEN);
  }
  cn_win->Update();

  // Take the temporary word back out of the page and release the iterator.
  pseudo_it->DeleteCurrentWord();
  delete pseudo_it;
}
Exemplo n.º 3
0
    // Restarts the page iterator and calls write_results() for each word.
    // When target_word_box is non-NULL, only words whose bounding-box centre
    // lies inside that box are passed on; all others are skipped.
    void Tesseract::output_pass(  //Tess output pass //send to api
            PAGE_RES_IT &page_res_it,
            const TBOX *target_word_box) {
        BLOCK_RES *block_of_last_word = NULL;

        for (page_res_it.restart_page(); page_res_it.word() != NULL;
             page_res_it.forward()) {
            check_debug_pt(page_res_it.word(), 120);

            if (target_word_box != NULL) {
                TBOX word_box = page_res_it.word()->word->bounding_box();
                FCOORD centre((word_box.right() + word_box.left()) / 2,
                              (word_box.bottom() + word_box.top()) / 2);
                if (!target_word_box->contains(centre))
                    continue;  // Centre outside the target box: skip word.
            }

            // Remember the block of the most recent word when block
            // separators are enabled.
            if (tessedit_write_block_separators &&
                block_of_last_word != page_res_it.block()) {
                block_of_last_word = page_res_it.block();
            }

            // An end of line is forced at a block change (when separators
            // are enabled) and after the last word on the page.
            BOOL8 force_eol =
                (tessedit_write_block_separators &&
                 (page_res_it.block() != page_res_it.next_block())) ||
                (page_res_it.next_word() == NULL);

            // Look ahead to the following word and block, if any.
            WERD *nextword = NULL;
            if (page_res_it.next_word() != NULL)
                nextword = page_res_it.next_word()->word;
            BLOCK *nextblock = NULL;
            if (page_res_it.next_block() != NULL)
                nextblock = page_res_it.next_block()->block;

            // Written regardless of tilde crunching.
            char newline_type =
                determine_newline_type(page_res_it.word()->word,
                                       page_res_it.block()->block,
                                       nextword, nextblock);
            write_results(page_res_it, newline_type, force_eol);
        }
    }
Exemplo n.º 4
0
/*************************************************************************
 * write_results()
 *
 * Per-word bookkeeping once recognition and rejection are finished.
 * This variant produces no text itself. It keeps the tilde / newline
 * tracking flags in stats_ up to date and, for words that are not tilde
 * crunched, may merge away a leading space blob and applies the zero /
 * minimal rejection overrides to the word's reject map.
 *
 *   page_res_it  - iterator whose current word is processed
 *   newline_type - type of newline; any non-zero value marks the last
 *                  character as a newline
 *   force_eol    - caller requires an end of line after this word
 *************************************************************************/
    void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                                  char newline_type,  // type of newline
                                  BOOL8 force_eol) {  // override tilde crunch?
        WERD_RES *res = page_res_it.word();
        const UNICHARSET &charset = *res->uch_set;
        UNICHAR_ID space_id = charset.unichar_to_id(" ");

        const bool crunched_or_empty =
            res->unlv_crunch_mode != CR_NONE ||
            res->best_choice->length() == 0;
        if (crunched_or_empty &&
            !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
            /* TILDE CRUNCHED (or empty) word: only the state flags change */
            bool reject_pending = false;
            if (res->unlv_crunch_mode != CR_DELETE) {
                // A reject is due if none has been written yet, or if this
                // word keeps its (non-fuzzy) leading space.
                bool start_reject = !stats_.tilde_crunch_written;
                if (!start_reject) {
                    start_reject =
                        (res->unlv_crunch_mode == CR_KEEP_SPACE) &&
                        (res->word->space() > 0) &&
                        !res->word->flag(W_FUZZY_NON) &&
                        !res->word->flag(W_FUZZY_SP);
                }
                if (start_reject) {
                    if (!res->word->flag(W_BOL) &&
                        (res->word->space() > 0) &&
                        !res->word->flag(W_FUZZY_NON) &&
                        !res->word->flag(W_FUZZY_SP)) {
                        stats_.last_char_was_tilde = false;
                    }
                    reject_pending = true;
                }
            }

            if ((reject_pending && !stats_.last_char_was_tilde) ||
                (force_eol && stats_.write_results_empty_block)) {
                /* Record that a reject char has been written */
                stats_.last_char_was_tilde = true;
                stats_.tilde_crunch_written = true;
                stats_.last_char_was_newline = false;
                stats_.write_results_empty_block = false;
            }

            if ((res->word->flag(W_EOL) && !stats_.last_char_was_newline) ||
                force_eol) {
                /* Record that a newline has been written */
                stats_.tilde_crunch_written = false;
                stats_.last_char_was_newline = true;
                stats_.last_char_was_tilde = false;
            }

            if (force_eol) {
                stats_.write_results_empty_block = true;
            }
            return;
        }

        /* NORMAL PROCESSING of non tilde crunched words */

        stats_.tilde_crunch_written = false;
        stats_.last_char_was_newline = (newline_type != 0);
        stats_.write_results_empty_block = force_eol;  // about to write a real word

        if (unlv_tilde_crunching &&
            stats_.last_char_was_tilde &&
            (res->word->space() == 0) &&
            (!res->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) &&
            (res->best_choice->unichar_id(0) == space_id)) {
            /* Prevent adjacent tilde across words - adjacent tildes within
               words have already been removed */
            res->MergeAdjacentBlobs(0);
        }

        if (newline_type ||
            (res->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
            stats_.last_char_was_tilde = false;
        } else if (res->reject_map.length() > 0) {
            // The flag follows whether the last output char is a space.
            stats_.last_char_was_tilde =
                (res->best_choice->unichar_id(res->reject_map.length() - 1) ==
                 space_id);
        } else if (res->word->space() > 0) {
            stats_.last_char_was_tilde = false;
        }
        /* otherwise it is unchanged as there are no output chars */

        ASSERT_HOST(res->best_choice->length() == res->reject_map.length());

        set_unlv_suspects(res);
        check_debug_pt(res, 120);
        if (tessedit_rejection_debug) {
            tprintf("Dict word: \"%s\": %d\n",
                    res->best_choice->debug_string().string(),
                    dict_word(*(res->best_choice)));
        }

        // Repetition-code words keep their reject map untouched.
        if (res->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) {
            return;
        }

        if (tessedit_zero_rejection) {
            /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
            for (int pos = 0; pos < res->best_choice->length(); ++pos) {
                if (res->reject_map[pos].rejected()) {
                    res->reject_map[pos].setrej_minimal_rej_accept();
                }
            }
        }
        if (tessedit_minimal_rejection) {
            /* As above, but positions holding a space stay rejected */
            for (int pos = 0; pos < res->best_choice->length(); ++pos) {
                if ((res->best_choice->unichar_id(pos) != space_id) &&
                    res->reject_map[pos].rejected()) {
                    res->reject_map[pos].setrej_minimal_rej_accept();
                }
            }
        }
    }
Exemplo n.º 5
0
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for the
 *                   epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *
 * Parameters:
 *   page_res_it   - iterator whose current word is processed
 *   newline_type  - type of newline; non-zero marks the last char as a newline
 *   force_eol     - caller requires an end of line after this word
 *
 * NOTE(review): the body visible here writes no files. Its observable
 * effects are the updates to stats_, word->ep_choice (crunched words only),
 * and the word's best_choice / reject_map / box_word. The file list above
 * looks historical - confirm before relying on it.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  // NOTE(review): repetition_code, wordstr and wordstr_lengths are filled in
  // near the end of the function but never read afterwards in this body.
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  // First character of the configured "unrecognised" string.
  char unrecognised = STRING (unrecognised_char)[0];
  // Scratch buffers for the tilde-crunch path. At most 8 characters plus the
  // terminator are stored in each (1 space + up to 6 reject + 1 newline).
  // NOTE(review): txt_chs and map_chs are written and terminated below but
  // not read anywhere in this function; only ep_chars is consumed.
  char ep_chars[32];             //Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];              //Only for unlv_tilde_crunch
  char map_chs[32];              //Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = uchset.unichar_to_id(" ");
  // Tilde-crunched or empty words take this early-return path unless
  // zero-kelvin rejection or word-for-word mode is enabled.
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    // A reject is needed unless the word is to be deleted; it is started if
    // no crunch tilde has been written yet, or if this word keeps a
    // non-fuzzy leading space.
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        // Accepted: the reject char itself goes into the ep string.
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      }
      else {
        map_chs[txt_index++] = '0';
        /*
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //type
        ep_chars[ep_chars_index++] = 2;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
                                 //end line
      ep_chars[ep_chars_index++] = newline_type;

                                 //Because of the real newline
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    // NOTE(review): any WERD_CHOICE already held in word->ep_choice is
    // overwritten here without being deleted - confirm ownership elsewhere.
    word->ep_choice = new WERD_CHOICE(ep_chars, uchset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    // Drop position 0 from the best choice, its first blob-choice list (if
    // any), the reject map and the box word.
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->reject_map.remove_pos (0);
    word->box_word->DeleteBox(0);
  }
  // Update last_char_was_tilde from what this word ends with.
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      // A trailing space unichar counts as a tilde.
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  // NOTE(review): this length check runs after the indexing above, which
  // already used reject_map.length() to index best_choice.
  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().string(),
             dict_word(*(word->best_choice)));
  }
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    // Build the repetition-code string "|^~R" followed by the repeated char.
    // NOTE(review): the result is only stored in locals and not used further
    // in this function.
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += uchset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      // Unlike the zero-rejection loop, positions holding a space unichar
      // are left rejected.
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}