示例#1
0
// Renders the words of page_res as a single NUL-terminated UNLV-style string.
//
// Words are visited in page order. A word whose unlv_crunch_mode is not
// CR_NONE contributes at most one kUnrecognized character (optionally
// preceded by a space); runs of such words collapse into a single reject
// character. Any other word is written character by character: blanks, '~'
// and '|' become kUnrecognized, rejected characters are prefixed with '^',
// and each character is translated through the kUniChs -> kLatinChs table.
// Characters whose resulting code is above 0xff are written as kUnrecognized.
// A '\n' is written after a word flagged W_EOL and once more at the very end.
//
// page_res: recognition results. When non-NULL it is deleted before the
//           function returns.
// Returns:  a buffer allocated with new char[] holding the text, or NULL
//           when page_res is NULL.
//           NOTE(review): presumably the caller releases it with delete[] -
//           confirm against the callers.
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
  bool tilde_crunch_written = false;
  bool last_char_was_newline = true;
  bool last_char_was_tilde = false;

  if (page_res != NULL) {
    // NOTE(review): the buffer size comes solely from TextLength(); every
    // write below (including the '^' markers, the final '\n' and the NUL)
    // relies on that bound being large enough - confirm.
    int total_length = TextLength(page_res);
    PAGE_RES_IT   page_res_it(page_res);
    char* result = new char[total_length];
    char* ptr = result;
    for (page_res_it.restart_page(); page_res_it.word () != NULL;
         page_res_it.forward()) {
      WERD_RES *word = page_res_it.word();
      // Process the current word.
      if (word->unlv_crunch_mode != CR_NONE) {
        // Tilde-crunched word: emit a reject char unless one has already been
        // written for this run, or the word is CR_KEEP_SPACE with a definite
        // (non-fuzzy) leading space. CR_DELETE words emit nothing.
        if (word->unlv_crunch_mode != CR_DELETE &&
            (!tilde_crunch_written ||
             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)))) {
          if (!word->word->flag (W_BOL) &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)) {
            // Write a space to separate from preceding good text.
            *ptr++ = ' ';
            last_char_was_tilde = false;
          }
          if (!last_char_was_tilde) {
            // Write a reject char.
            last_char_was_tilde = true;
            *ptr++ = kUnrecognized;
            tilde_crunch_written = true;
            last_char_was_newline = false;
          }
        }
      } else {
        // NORMAL PROCESSING of non tilde crunched words.
        tilde_crunch_written = false;

        if (last_char_was_tilde &&
            word->word->space () == 0 &&
            (word->best_choice->string ()[0] == ' ')) {
          // Prevent adjacent tildes across words - adjacent tildes within
          // words have already been removed. The leading reject character is
          // dropped from the text, its lengths string, the reject map and
          // the blob list.
          //
          // The strings are edited in place, so source and destination
          // overlap: memmove is required here because strcpy on overlapping
          // buffers is undefined behaviour. Moving strlen(p) bytes starting
          // at p + 1 shifts the remaining characters together with the
          // terminating NUL, and is a no-op for an empty string.
          char* p = const_cast<char*>(word->best_choice->string().string());
          memmove(p, p + 1, strlen(p));
          p = const_cast<char*>(word->best_choice->lengths().string());
          memmove(p, p + 1, strlen(p));
          word->reject_map.remove_pos (0);
          PBLOB_IT blob_it = word->outword->blob_list ();
          delete blob_it.extract ();   // get rid of reject blob
        }

        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
          ensure_rep_chars_are_consistent(word);

        set_unlv_suspects(word);
        const char* wordstr = word->best_choice->string().string();
        if (wordstr[0] != 0) {
          // Separate from the previous word unless this is the first word
          // on the output line.
          if (!last_char_was_newline)
            *ptr++ = ' ';
          else
            last_char_was_newline = false;
          // lengths[i] is the number of bytes character i occupies in
          // wordstr; offset tracks the start of the current character.
          int offset = 0;
          const STRING& lengths = word->best_choice->lengths();
          int length = lengths.length();
          for (int i = 0; i < length; offset += lengths[i++]) {
            if (wordstr[offset] == ' ' ||
                wordstr[offset] == '~' ||
                wordstr[offset] == '|') {
              *ptr++ = kUnrecognized;
              last_char_was_tilde = true;
            } else {
              // Mark a rejected character with a leading '^'.
              if (word->reject_map[i].rejected())
                *ptr++ = '^';
              UNICHAR ch(wordstr + offset, lengths[i]);
              int uni_ch = ch.first_uni();
              // Substitute via the 0-terminated kUniChs table and its
              // parallel kLatinChs table.
              for (int j = 0; kUniChs[j] != 0; ++j) {
                if (kUniChs[j] == uni_ch) {
                  uni_ch = kLatinChs[j];
                  break;
                }
              }
              if (uni_ch <= 0xff) {
                *ptr++ = static_cast<char>(uni_ch);
                last_char_was_tilde = false;
              } else {
                // Does not fit in a single byte.
                *ptr++ = kUnrecognized;
                last_char_was_tilde = true;
              }
            }
          }
        }
      }
      if (word->word->flag(W_EOL) && !last_char_was_newline) {
        // Add a new line output and reset the per-line state.
        *ptr++ = '\n';
        tilde_crunch_written = false;
        last_char_was_newline = true;
        last_char_was_tilde = false;
      }
    }
    *ptr++ = '\n';
    *ptr = '\0';
    delete page_res;
    return result;
  }
  return NULL;
}
示例#2
0
/*************************************************************************
 * write_results()
 *
 * Per-word bookkeeping performed once recognition and rejection are done.
 * This variant writes no text; it only updates the running output state in
 * stats_ (tilde_crunch_written, last_char_was_tilde, last_char_was_newline,
 * write_results_empty_block) and, for normally processed words, the word:
 *   - a tilde-crunched or empty word (unless zero-kelvin rejection or
 *     word-for-word mode is on) only updates stats_ and returns early;
 *   - otherwise MergeAdjacentBlobs(0) may be called to avoid adjacent
 *     tildes across words, set_unlv_suspects() is applied, and rejections
 *     are accepted when zero or minimal rejection is switched on.
 *
 *   page_res_it  - iterator whose current word is processed
 *   newline_type - type of newline; any non-zero value marks the word as
 *                  followed by a newline
 *   force_eol    - treat the word as ending a line regardless of W_EOL
 *************************************************************************/
    void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                                  char newline_type,  // type of newline
                                  BOOL8 force_eol) {  // override tilde crunch?
        WERD_RES *res = page_res_it.word();
        const UNICHARSET &unicharset = *res->uch_set;
        BOOL8 emit_reject = FALSE;
        const UNICHAR_ID space_id = unicharset.unichar_to_id(" ");

        const bool crunched_or_empty =
            res->unlv_crunch_mode != CR_NONE ||
            res->best_choice->length() == 0;

        if (crunched_or_empty &&
            !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
            // ---- Tilde-crunched (or empty) word: state updates only. ----
            if (res->unlv_crunch_mode != CR_DELETE) {
                // A reject is wanted if none has been written for this run,
                // or the word keeps a definite (non-fuzzy) leading space.
                if (!stats_.tilde_crunch_written ||
                    (res->unlv_crunch_mode == CR_KEEP_SPACE &&
                     res->word->space() > 0 &&
                     !res->word->flag(W_FUZZY_NON) &&
                     !res->word->flag(W_FUZZY_SP))) {
                    // A definite space inside a line separates this word
                    // from whatever tilde came before it.
                    if (!res->word->flag(W_BOL) &&
                        res->word->space() > 0 &&
                        !res->word->flag(W_FUZZY_NON) &&
                        !res->word->flag(W_FUZZY_SP)) {
                        stats_.last_char_was_tilde = false;
                    }
                    emit_reject = TRUE;
                }
            }

            if ((emit_reject && !stats_.last_char_was_tilde) ||
                (force_eol && stats_.write_results_empty_block)) {
                // Record that a reject char stands at this position.
                stats_.last_char_was_tilde = TRUE;
                stats_.tilde_crunch_written = true;
                stats_.last_char_was_newline = false;
                stats_.write_results_empty_block = false;
            }

            if ((res->word->flag(W_EOL) && !stats_.last_char_was_newline) ||
                force_eol) {
                // End of line: reset the per-line state.
                stats_.tilde_crunch_written = false;
                stats_.last_char_was_newline = true;
                stats_.last_char_was_tilde = false;
            }

            if (force_eol) {
                stats_.write_results_empty_block = true;
            }
            return;
        }

        // ---- Normal processing of a word that is not tilde crunched. ----
        stats_.tilde_crunch_written = false;
        stats_.last_char_was_newline = (newline_type != 0);
        stats_.write_results_empty_block = force_eol;  // about to write a real word

        // Avoid adjacent tildes across words; adjacent tildes within a word
        // have already been removed.
        if (unlv_tilde_crunching &&
            stats_.last_char_was_tilde &&
            res->word->space() == 0 &&
            !(res->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
            res->best_choice->unichar_id(0) == space_id) {
            res->MergeAdjacentBlobs(0);
        }

        // Work out whether this word leaves a tilde as the last character.
        if (newline_type ||
            (res->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
            stats_.last_char_was_tilde = false;
        } else if (res->reject_map.length() > 0) {
            const int last_index = res->reject_map.length() - 1;
            stats_.last_char_was_tilde =
                (res->best_choice->unichar_id(last_index) == space_id);
        } else if (res->word->space() > 0) {
            stats_.last_char_was_tilde = false;
        }
        // Otherwise the flag is unchanged: there are no output chars.

        ASSERT_HOST(res->best_choice->length() == res->reject_map.length());

        set_unlv_suspects(res);
        check_debug_pt(res, 120);
        if (tessedit_rejection_debug) {
            tprintf("Dict word: \"%s\": %d\n",
                    res->best_choice->debug_string().string(),
                    dict_word(*(res->best_choice)));
        }

        // Repetition-coded words keep their reject map untouched.
        if (res->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) {
            return;
        }

        if (tessedit_zero_rejection) {
            // Override all rejection mechanisms: accept every rejected char.
            for (int idx = 0; idx < res->best_choice->length(); ++idx) {
                if (res->reject_map[idx].rejected()) {
                    res->reject_map[idx].setrej_minimal_rej_accept();
                }
            }
        }
        if (tessedit_minimal_rejection) {
            // Same override, but positions holding the space unichar stay
            // rejected.
            for (int idx = 0; idx < res->best_choice->length(); ++idx) {
                if (res->best_choice->unichar_id(idx) == space_id) {
                    continue;
                }
                if (res->reject_map[idx].rejected()) {
                    res->reject_map[idx].setrej_minimal_rej_accept();
                }
            }
        }
    }
示例#3
0
/*************************************************************************
 * write_results()
 *
 * Per-word bookkeeping performed once recognition and rejection are done.
 *
 * For a tilde-crunched or empty word (unless zero-kelvin rejection or
 * word-for-word mode is on) the function assembles three short local
 * buffers - txt_chs (text), map_chs (reject map) and ep_chars (epaper
 * text) - containing an optional space, an optional reject character and
 * an optional newline, stores a new WERD_CHOICE built from ep_chars in
 * word->ep_choice, updates stats_ and returns.
 *
 * For any other word it updates stats_, may strip a leading reject
 * character to avoid adjacent tildes across words, applies
 * set_unlv_suspects(), and accepts rejections when zero or minimal
 * rejection is switched on.
 *
 * NOTE(review): the historical description of this function listed .txt,
 * .raw and .map files plus epchoice and inset lists as outputs; no file is
 * written by the code in this block.
 *
 *   page_res_it  - iterator whose current word is processed
 *   newline_type - type of newline; written to ep_chars at end of line and
 *                  any non-zero value marks the word as followed by a newline
 *   force_eol    - treat the word as ending a line regardless of W_EOL
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  // NOTE(review): repetition_code, wordstr and wordstr_lengths are assigned
  // in the rep-code branch near the end but never read in this function.
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  // First character of the unrecognised_char setting: the reject character.
  char unrecognised = STRING (unrecognised_char)[0];
  // The three buffers below are only filled on the tilde-crunch path.
  char ep_chars[32];             // epaper text, becomes word->ep_choice
  int ep_chars_index = 0;
  // NOTE(review): txt_chs and map_chs are written and NUL-terminated but
  // never read in this function. Both share the index txt_index.
  char txt_chs[32];              // text output for the crunched word
  char map_chs[32];              // reject map: '1', '0' or '\n' per text char
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = uchset.unichar_to_id(" ");
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    // A reject is wanted unless the word is CR_DELETE, and only if no reject
    // has been written for this run or the word is CR_KEEP_SPACE with a
    // definite (non-fuzzy) leading space.
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        // Map entry '1': the reject char itself goes into the epaper text.
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      }
      else {
        // Map entry '0': the epaper text gets a six-byte inset sequence.
        map_chs[txt_index++] = '0';
        /*
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
        // dummy reject
        ep_chars[ep_chars_index++] = 1;
        // dummy reject
        ep_chars[ep_chars_index++] = 1;
        // type
        ep_chars[ep_chars_index++] = 2;
        // dummy reject
        ep_chars[ep_chars_index++] = 1;
        // dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
      // End the line in the epaper text with the caller's newline type.
      ep_chars[ep_chars_index++] = newline_type;

      // A real newline was written, so reset the per-line state.
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    // NOTE(review): any previous value of word->ep_choice is overwritten
    // without being released here - confirm it is unset at this point.
    word->ep_choice = new WERD_CHOICE(ep_chars, uchset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    // Drop position 0 from the best choice, its blob choices (when present),
    // the reject map and the box word.
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->reject_map.remove_pos (0);
    word->box_word->DeleteBox(0);
  }
  // Work out whether this word leaves a tilde as the last character.
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      // The last position holds the space unichar exactly when the word
      // ends in a reject.
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  // The best choice and the reject map must describe the same characters.
  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().string(),
             dict_word(*(word->best_choice)));
  }
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    // Build the repetition code "|^~R" followed by the repeated character,
    // with a parallel string of per-character byte lengths.
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += uchset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      // Positions holding the space unichar stay rejected.
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}