コード例 #1
0
ファイル: tfacepp.cpp プロジェクト: 0ximDigital/appsScanner
// Runs recognition on a single word and validates the result.
//
// After recog_word_recursive() and SetupBoxWord() the best choice must have
// exactly one box per unichar and every segmentation state must be
// consistent with the ratings matrix; both are enforced with ASSERT_HOST.
// Optionally the permuter type is replaced by the one a plain dictionary
// lookup reports. Finally word->tess_failed is set, and the reject map is
// marked as a tess failure when the result is missing, empty or all spaces.
void Tesseract::recog_word(WERD_RES *word) {
  // Optionally skip words that carry no ground truth.
  if (wordrec_skip_no_truth_words) {
    const bool no_truth =
        word->blamer_bundle == NULL ||
        word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH;
    if (no_truth) {
      if (classify_debug_level) tprintf("No truth for word - skipping\n");
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // Report the mismatch in detail before the assertion below fires.
  const bool lengths_agree =
      word->best_choice->length() == word->box_word->length();
  if (!lengths_agree) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
            "Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(), word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The ratings matrix size has to match the sum of all the segmentation
  // states; dump the choices before asserting if it does not.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // A non-dawg permuter is overridden when a straight dictionary check
    // says the word is a dawg word and it contains at least one alpha.
    const uinT8 original_perm = word->best_choice->permuter();
    const bool already_dawg = original_perm == SYSTEM_DAWG_PERM ||
                              original_perm == FREQ_DAWG_PERM ||
                              original_perm == USER_DAWG_PERM;
    if (!already_dawg) {
      const uinT8 lookup_perm = dict_word(*word->best_choice);
      const bool lookup_is_dawg = lookup_perm == SYSTEM_DAWG_PERM ||
                                  lookup_perm == FREQ_DAWG_PERM ||
                                  lookup_perm == USER_DAWG_PERM;
      if (lookup_is_dawg &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(lookup_perm);
      }
    }
    if (tessedit_rejection_debug &&
        original_perm != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              original_perm, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));

  // A missing, empty or all-space result counts as a tess failure.
  bool failed = true;
  if (word->best_choice != NULL && word->best_choice->length() != 0) {
    const int leading_spaces = static_cast<int>(
        strspn(word->best_choice->unichar_string().string(), " "));
    failed = (leading_spaces == word->best_choice->length());
  }

  if (failed) {
    word->tess_failed = true;
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  } else {
    word->tess_failed = false;
  }
}
コード例 #2
0
ファイル: output.cpp プロジェクト: mehulsbhatt/MyOCRTEST
/*************************************************************************
 * write_results()
 *
 * Updates the output bookkeeping flags held in stats_ (tilde_crunch_written,
 * last_char_was_tilde, last_char_was_newline, write_results_empty_block) for
 * the current word of page_res_it, and applies the zero/minimal rejection
 * overrides to the word's reject map.
 *
 * NOTE(review): the historical description below lists files and lists that
 * this function used to generate. None of them is written by the code
 * visible in this version of the function.
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for the
 *                   epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *
 * page_res_it  - iterator whose current word (page_res_it.word()) is processed.
 * newline_type - a non-zero value marks the word as followed by a newline.
 * force_eol    - forces the end-of-line bookkeeping even for crunched words.
 *************************************************************************/
    void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                                  char newline_type,  // type of newline
                                  BOOL8 force_eol) {  // override tilde crunch?
        WERD_RES *word = page_res_it.word();
        const UNICHARSET &uchset = *word->uch_set;
        int i;
        BOOL8 need_reject = FALSE;
        // Id of the space unichar; compared against best_choice entries below.
        UNICHAR_ID space = uchset.unichar_to_id(" ");

        // Crunched (or empty) word: only the stats_ flags are updated, then
        // the function returns without touching the reject map.
        if ((word->unlv_crunch_mode != CR_NONE ||
             word->best_choice->length() == 0) &&
            !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
            if ((word->unlv_crunch_mode != CR_DELETE) &&
                (!stats_.tilde_crunch_written ||
                 ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
                  (word->word->space() > 0) &&
                  !word->word->flag(W_FUZZY_NON) &&
                  !word->word->flag(W_FUZZY_SP)))) {
                // A definite (non-fuzzy) space before a word that does not
                // start a line resets the tilde flag.
                if (!word->word->flag(W_BOL) &&
                    (word->word->space() > 0) &&
                    !word->word->flag(W_FUZZY_NON) &&
                    !word->word->flag(W_FUZZY_SP)) {
                    stats_.last_char_was_tilde = false;
                }
                need_reject = TRUE;
            }
            if ((need_reject && !stats_.last_char_was_tilde) ||
                (force_eol && stats_.write_results_empty_block)) {
                /* Write a reject char - mark as rejected unless zero_rejection mode */
                // In this version only the flags recording the reject char
                // are updated; no character is emitted here.
                stats_.last_char_was_tilde = TRUE;
                stats_.tilde_crunch_written = true;
                stats_.last_char_was_newline = false;
                stats_.write_results_empty_block = false;
            }

            // End of line (real or forced): reset the per-line flags.
            if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
                stats_.tilde_crunch_written = false;
                stats_.last_char_was_newline = true;
                stats_.last_char_was_tilde = false;
            }

            if (force_eol)
                stats_.write_results_empty_block = true;
            return;
        }

        /* NORMAL PROCESSING of non tilde crunched words */

        stats_.tilde_crunch_written = false;
        if (newline_type)
            stats_.last_char_was_newline = true;
        else
            stats_.last_char_was_newline = false;
        stats_.write_results_empty_block = force_eol;  // about to write a real word

        // NOTE(review): unichar_id(0) is read without checking that
        // best_choice is non-empty; an empty best_choice can reach this point
        // when tessedit_zero_kelvin_rejection or tessedit_word_for_word is
        // set - confirm unichar_id() tolerates that.
        if (unlv_tilde_crunching &&
            stats_.last_char_was_tilde &&
            (word->word->space() == 0) &&
            !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
            (word->best_choice->unichar_id(0) == space)) {
            /* Prevent adjacent tilde across words - we know that adjacent tildes within
               words have been removed */
            word->MergeAdjacentBlobs(0);
        }
        // Record whether the last position of this word is a space, so the
        // next call can detect adjacent tildes across words.
        if (newline_type ||
            (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
            stats_.last_char_was_tilde = false;
        else {
            if (word->reject_map.length() > 0) {
                if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
                    stats_.last_char_was_tilde = true;
                else
                    stats_.last_char_was_tilde = false;
            }
            else if (word->word->space() > 0)
                stats_.last_char_was_tilde = false;
            /* else it is unchanged as there are no output chars */
        }

        // The loops below index reject_map by best_choice position.
        ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

        set_unlv_suspects(word);
        check_debug_pt(word, 120);
        if (tessedit_rejection_debug) {
            tprintf("Dict word: \"%s\": %d\n",
                    word->best_choice->debug_string().string(),
                    dict_word(*(word->best_choice)));
        }
        // The rejection overrides are skipped for repeated-char words when
        // rep codes are being written.
        if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
            if (tessedit_zero_rejection) {
                /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
                for (i = 0; i < word->best_choice->length(); ++i) {
                    if (word->reject_map[i].rejected())
                        word->reject_map[i].setrej_minimal_rej_accept();
                }
            }
            if (tessedit_minimal_rejection) {
                /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
                // Unlike the zero-rejection loop, positions holding the space
                // unichar stay rejected here.
                for (i = 0; i < word->best_choice->length(); ++i) {
                    if ((word->best_choice->unichar_id(i) != space) &&
                        word->reject_map[i].rejected())
                        word->reject_map[i].setrej_minimal_rej_accept();
                }
            }
        }
    }
コード例 #3
0
ファイル: tfacepp.cpp プロジェクト: chanchai/botker
/**********************************************************************
 * recog_word
 *
 * Recognize one word. A word without blobs yields empty best/raw choices
 * and a plain polygonal copy as outword; otherwise the work is delegated
 * to recog_word_recursive. The result must have one length entry per
 * output blob and per blob-choice list. When tessedit_override_permuter
 * is set, a non-dawg permuter is replaced by the type reported by a plain
 * dictionary lookup.
 **********************************************************************/
WERD_CHOICE *recog_word(WERD *word,                // word to recognize
                        DENORM *denorm,            // de-normaliser
                        POLY_MATCHER matcher,      // matcher function
                        POLY_TESTER tester,        // tester function
                        POLY_TESTER trainer,       // trainer function
                        BOOL8 testing,             // true if answer driven
                        WERD_CHOICE *&raw_choice,  // out: raw result
                        BLOB_CHOICE_LIST_CLIST *blob_choices,  // list of blob lists
                        WERD *&outword) {          // out: bln word
  WERD_CHOICE *word_choice;

  if (!word->blob_list ()->empty ()) {
    word_choice = recog_word_recursive (word, denorm, matcher, tester,
                                        trainer, testing, raw_choice,
                                        blob_choices, outword);
  } else {
    // Nothing to classify: fabricate empty results.
    char no_lengths[] = {0};
    word_choice = new WERD_CHOICE ("", no_lengths,
                                   10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE ("", no_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy (denorm->row ()->x_height ());
  }

  // Print the details of any size mismatch before the assertions fire.
  if (!((word_choice->lengths ().length () ==
         outword->blob_list ()->length ()) &&
        (word_choice->lengths ().length () == blob_choices->length ()))) {
    tprintf
      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
      word_choice->string ().string (), word_choice->lengths ().length (),
      outword->blob_list ()->length (), blob_choices->length ());
  }
  ASSERT_HOST (word_choice->lengths ().length () ==
    outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  // Carry the reject blobs of the input word over to the output word.
  outword->rej_blob_list ()->deep_copy (word->rej_blob_list ());

  if (tessedit_override_permuter) {
    // Only a non-dawg permuter can be overridden, and only when the
    // dictionary lookup reports a dawg type for a word containing alphas.
    const uinT8 original_perm = word_choice->permuter ();
    const bool already_dawg = (original_perm == SYSTEM_DAWG_PERM) ||
                              (original_perm == FREQ_DAWG_PERM) ||
                              (original_perm == USER_DAWG_PERM);
    if (!already_dawg) {
      const uinT8 lookup_perm = dict_word (word_choice->string ().string ());
      const bool lookup_is_dawg = (lookup_perm == SYSTEM_DAWG_PERM) ||
                                  (lookup_perm == FREQ_DAWG_PERM) ||
                                  (lookup_perm == USER_DAWG_PERM);
      if (lookup_is_dawg &&
          (alpha_count (word_choice->string ().string (),
                        word_choice->lengths ().string ()) > 0)) {
        word_choice->set_permuter (lookup_perm);
      }
    }
    if (tessedit_rejection_debug &&
        original_perm != word_choice->permuter ()) {
      tprintf ("Permuter Type Flipped from %d to %d\n",
        original_perm, word_choice->permuter ());
    }
  }
  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
コード例 #4
0
ファイル: decompressor.c プロジェクト: danyf90/lz78
/*
 * Decompress a stream produced by the matching compressor.
 *
 * in_filename  - compressed input file, or NULL to read from bstdin.
 * out_filename - destination file, or NULL to write to stdout. When
 *                DEC_ORIG_FILENAME is set in flags it is replaced by the
 *                name stored in the stream metadata.
 * flags        - option bits; only DEC_ORIG_FILENAME is examined here.
 *
 * Returns the number of bytes written, or -1 on failure. On failure the
 * output file is removed only if this call created it. stdout is flushed
 * but never closed.
 */
int64_t decompress(const char* in_filename, const char* out_filename, uint8_t flags) {

    struct bitio        *bd = bstdin;
    struct dictionary   *d = NULL;
    struct utimbuf      *t = NULL;
    FILE*               fout = stdout;
    char                *out_file = NULL;
    uint8_t             bits, initial_bits, meta_type, meta_size;
    uint16_t            c;
    uint32_t            bitMask, cur, first_record, len, next_record, dict_size = 0, written = 0, write_count = 0;
    uint64_t            filesize = 0;
    char                *word;
    int                 first = 1, md5c_size = 0, md5d_size = 0;
    int                 meta_ok;
    int                 out_created = 0;    // set once out_filename has been created/truncated by us
    int64_t             result = -1;
    void                *meta_data, *md5c = NULL, *md5d = NULL;
    EVP_MD_CTX          *md_ctx = NULL;

    if (in_filename != NULL) {
        bd = bitio_open(in_filename, 'r');
        if (bd == NULL)
            goto error;
    }

    //read metadata
    while ((meta_data = meta_read(bd, &meta_type, &meta_size)) != META_END) {
        LOG("META_TYPE: %d", meta_type);
        meta_ok = 1;
        switch (meta_type) {
        case META_DICT_SIZE:
            dict_size = *(uint32_t*)meta_data;
            PRINT(1, "Dictionary Size:\t%d\n", dict_size);
            break;

        case META_NAME:
            PRINT(1, "Original file name:\t%s\n", (char*)meta_data);
            if (flags & DEC_ORIG_FILENAME) {
                out_file = malloc(meta_size);
                if (out_file == NULL) {
                    meta_ok = 0;
                    break;
                }
                memcpy((void*)out_file, meta_data, meta_size);
                out_filename = out_file;
            }
            break;

        case META_MD5:
            md5c = malloc(meta_size);
            if (md5c == NULL) {
                meta_ok = 0;
                break;
            }
            memcpy(md5c, meta_data, meta_size);
            md5c_size = meta_size;
            word = sprinth(md5c, md5c_size);
            PRINT(1, "Original md5sum:\t%s\n", word);
            free(word);
            // initialize md context
            OpenSSL_add_all_digests();
            md_ctx = malloc(sizeof(EVP_MD_CTX));
            if (md_ctx == NULL) {
                meta_ok = 0;
                break;
            }
            EVP_MD_CTX_init(md_ctx);
            if (!EVP_DigestInit(md_ctx, EVP_get_digestbyname("md5"))) {
                meta_ok = 0;
                break;
            }
            md5d_size = EVP_MD_CTX_size(md_ctx);
            md5d = malloc(md5d_size);
            if (md5d == NULL)
                meta_ok = 0;
            break;

        case META_TIMESTAMP:
            t = malloc(sizeof(*t));
            if (t == NULL) {
                meta_ok = 0;
                break;
            }
            t->actime = *((time_t*)meta_data); // access time
            t->modtime = *((time_t*)meta_data); // modification time
            break;

        default: // META_ERROR
            // meta_data is deliberately not freed here: it is not visible
            // from this function whether it is a valid allocation when
            // meta_read reports an error.
            LOG("Unknown metadata");
            errno = EINVAL;
            goto error;
        }
        free(meta_data);
        if (!meta_ok)
            goto error;
    }

    // NOTE(review): the fallback name is the literal file "stdin", as in the
    // original code; confirm this is the intended default.
    if ((flags & DEC_ORIG_FILENAME) && out_file == NULL) // if i have DEC_ORIG_FILENAME setted but no info in metadata i use stdin as outfile
        out_filename = "stdin";

    // All validation happens BEFORE the output is opened: opening with "w"
    // truncates the file, and the error path removes it.
    if (out_filename != NULL && in_filename != NULL && strcmp(in_filename, out_filename) == 0) {
        errno = EINVAL;
        goto error;
    }

    if (dict_size == 0) {
        errno = EINVAL;
        goto error;
    }

    if (out_filename != NULL) {
        fout = fopen(out_filename, "w");
        if (fout == NULL)
            goto error;
        out_created = 1;
    }

    d = dict_new(dict_size, 0, dict_size, NUM_SYMBOLS);

    if (d == NULL)
        goto error;

    // compute the initial code width from the first free record index
    first_record = dict_init(d);
    next_record = first_record;
    initial_bits = 0;
    bitMask = 1;
    while (bitMask < next_record) {
        bitMask <<= 1;
        initial_bits++;
    }
    bits = initial_bits;

    for (;;) {
        // put in cur the index of the fetched word in the dictionary
        cur = fetch(bd, bits);
        if (cur == ROOT_NODE)
            goto error;

        if (cur == EOF_SYMBOL)
            break;

        c = dict_first_symbol(d, cur);

        if (c == EOF_SYMBOL)
            goto error;

        if (!first) {
            // complete previous record with index of new record
            // ROOT_NODE as current node value means 'don't change it'.
            dict_fill(d, next_record, ROOT_NODE, (uint8_t) c, 0);
            next_record++;
            if ((next_record+1) & bitMask) {
                bitMask <<= 1;
                bits++;
            }
        }
        else
            first = 0;

        // get the word in the dictionary at index cur.
        word = dict_word(d, cur, &len);
        if (word == NULL)
            goto error;

        written = fwrite(word, 1, len, fout);

        if (written < len)
            goto error;
        else { // md5 computation and visual feedback

            if (md5c != NULL) // compute md5 of decompressed
                EVP_DigestUpdate(md_ctx, word, len);

            write_count += written;
            if (write_count >= COUNT_THRESHOLD) {
                filesize += write_count;
                write_count = 0;
                PRINT(1, ".");
            }
        }

        // dictionary full: restart from the initial state
        if (next_record + 1 == dict_size) {

            next_record = first_record;

            bits = initial_bits;
            bitMask = 1 << bits;

            first = 1; // set first iteration to be the next
        }

        // add a new record
        dict_fill(d, next_record, cur, 0, 0); // symbol will be filled at the beginning of next iteration

    }

    filesize += write_count;

    if (md5c != NULL) {
        EVP_DigestFinal_ex(md_ctx, md5d, (unsigned int*)&md5d_size);

        if (md5c_size == md5d_size && memcmp(md5c, md5d, md5c_size) == 0)
            PRINT(1, "\nmd5sum Check:\t\tOK");
        else {
            PRINT(1, "\nmd5sum Check:\t\tFailed");
            goto error;
        }
    }

    PRINT(1, "\nDecompression Finished\n\n");

    // Push buffered data out and report write errors. stdout belongs to the
    // caller, so it is flushed rather than closed.
    if (fout == stdout) {
        if (fflush(fout) != 0)
            goto error;
    } else {
        int rc = fclose(fout);
        fout = NULL;
        if (rc != 0)
            goto error;
    }

    if (out_file != NULL && t != NULL)
        if (utime(out_filename, t) < 0) { // set modification time
            PRINT(1, "Error while changing last modification time");
        }

    result = (int64_t) filesize;
    goto cleanup;

error:
    PRINT(1, "\n");
    if (fout != NULL && fout != stdout)
        fclose(fout);
    // remove the partial output only if this call created it
    // (must run before out_file is freed: out_filename may alias it)
    if (out_created)
        unlink(out_filename);

cleanup:
    free(out_file);
    free(t);
    free(md5c);
    free(md5d);
    if (md_ctx != NULL) {
        EVP_MD_CTX_cleanup(md_ctx);
        free(md_ctx);
    }
    if (d != NULL)
        dict_delete(d);
    if (bd != NULL) {
        bitio_flush(bd);
        if (bd != bstdin)
            bitio_close(bd);
    }
    return result;
}
コード例 #5
0
ファイル: output.cpp プロジェクト: 0xkasun/Dummy_Tes
/*************************************************************************
 * write_results()
 *
 * Updates the output bookkeeping flags in stats_ for the current word of
 * page_res_it. For tilde-crunched (or empty) words it builds the word's
 * ep_choice from a short escape string; for all other words it may strip a
 * leading space and applies the zero/minimal rejection overrides to the
 * reject map.
 *
 * NOTE(review): the historical description below lists files and lists
 * this function used to generate. In the code visible here only
 * word->ep_choice is produced; txt_chs and map_chs are filled but never
 * read, and repetition_code/wordstr/wordstr_lengths are built but not used
 * afterwards.
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for the
 *                   epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *
 * page_res_it  - iterator whose current word (page_res_it.word()) is processed.
 * newline_type - stored as the end-of-line char of ep_chars for crunched
 *                words; a non-zero value marks a normal word as followed
 *                by a newline.
 * force_eol    - forces the end-of-line handling even for crunched words.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  // First character of the unrecognised_char setting.
  char unrecognised = STRING (unrecognised_char)[0];
  // Scratch buffers for the crunched-word branch. That branch appends at
  // most 1 (space) + 6 (reject escape) + 1 (newline) chars plus the
  // terminator, well within 32.
  char ep_chars[32];             //Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];              //Only for unlv_tilde_crunch
  char map_chs[32];              //Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  // Id of the space unichar; compared against best_choice entries below.
  UNICHAR_ID space = uchset.unichar_to_id(" ");
  // Crunched (or empty) word: build ep_choice, update stats_ and return
  // without touching the reject map.
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      }
      else {
        map_chs[txt_index++] = '0';
        /*
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
        // Six-byte sequence: CTRL_INSET followed by 1, 1, 2, 1, 1. The
        // per-byte labels are the original author's.
        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //type
        ep_chars[ep_chars_index++] = 2;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
                                 //end line
      ep_chars[ep_chars_index++] = newline_type;

                                 //Cos of the real newline
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    // NOTE(review): ep_choice is assigned without releasing any previous
    // value - confirm it is always unset at this point.
    word->ep_choice = new WERD_CHOICE(ep_chars, uchset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  // NOTE(review): unichar_id(0) is read without checking that best_choice
  // is non-empty; an empty best_choice can reach this point when
  // tessedit_zero_kelvin_rejection or tessedit_word_for_word is set.
  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    // Drop position 0 from the best choice, its blob choices, the reject
    // map and the box word.
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->reject_map.remove_pos (0);
    word->box_word->DeleteBox(0);
  }
  // Record whether the last position of this word is a space, so the next
  // call can detect adjacent tildes across words.
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  // The loops below index reject_map by best_choice position.
  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().string(),
             dict_word(*(word->best_choice)));
  }
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    // Builds "|^~R" + the repeated char and a matching lengths string; the
    // result is not consumed anywhere later in this function.
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += uchset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      // Unlike the zero-rejection loop, positions holding the space unichar
      // stay rejected here.
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
コード例 #6
0
ファイル: reject.cpp プロジェクト: xmarston/BillRecognizer
/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the result is acceptable and
 * builds the reject map for the word.
 *
 * The rejection strategy is selected by tessedit_reject_mode:
 *   0 - Ray's original heuristic (the baseline): words that are not done
 *       have their poor matches rejected.
 *   5 - Reject I/1/l where there is no strong contextual confirmation, the
 *       whole of any unacceptable word (including permuter rejection of
 *       dubious 1/I/l) and the whole of any very small word.
 * Any other mode is reported and handed to err_exit().
 *************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
  flip_0O(word);
  check_debug_pt(word, -1);  // For trap only
  set_done(word, pass);      // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  if (tessedit_reject_mode == 0) {
    if (!word->done) {
      reject_poor_matches(word);
    }
  } else if (tessedit_reject_mode == 5) {
    const bool too_small =
        kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels;
    if (too_small) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, TRUE);

      // The conditions that feed the done flag are repeated here one by
      // one so that every mechanism can be switched independently, without
      // affecting how the done flag itself was set.
      if (rej_use_tess_accepted && !word->tess_accepted) {
        word->reject_map.rej_word_not_tess_accepted ();
      }
      if (rej_use_tess_blanks &&
          (strchr (word->best_choice->unichar_string().string (), ' ') != NULL)) {
        word->reject_map.rej_word_contains_blanks ();
      }

      WERD_CHOICE* choice = word->best_choice;
      if (rej_use_good_perm) {
        const bool trusted_word =
            (choice->permuter() == SYSTEM_DAWG_PERM ||
             choice->permuter() == FREQ_DAWG_PERM ||
             choice->permuter() == USER_DAWG_PERM) &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(*word->uch_set,
                                    choice->unichar_string().string(),
                                    choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE);
        if (!trusted_word) {
          if (choice->permuter() == NUMBER_PERM) {
            if (rej_alphas_in_number_perm) {
              // Walk the unichars (blob index plus byte position) and
              // reject every accepted alpha found in the number.
              int blob = 0;
              int byte_pos = 0;
              while (choice->unichar_string()[byte_pos] != '\0') {
                if (word->reject_map[blob].accepted() &&
                    word->uch_set->get_isalpha(
                        choice->unichar_string().string() + byte_pos,
                        choice->unichar_lengths()[blob])) {
                  word->reject_map[blob].setrej_bad_permuter();
                }
                byte_pos += choice->unichar_lengths()[blob];
                ++blob;
              }
            }
          } else {
            word->reject_map.rej_word_bad_permuter();
          }
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1) {
    reject_edge_blobs(word);
  }

  check_debug_pt (word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
    tprintf("Certainty: %f     Rating: %f\n",
      word->best_choice->certainty (), word->best_choice->rating ());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
コード例 #7
0
ファイル: reject.cpp プロジェクト: xmarston/BillRecognizer
/**********************************************************************
 * one_ell_conflict()
 *
 * Identify words where there is a potential I/l/1 error.
 * - A bundle of contextual heuristics!
 *
 * word_res   - word whose best_choice is examined. Its unichar string is
 *              temporarily modified in place to test alternative
 *              spellings.
 * update_map - when TRUE, the affected reject_map entries are marked
 *              (setrej_1Il_conflict / reject_I_1_L) for detected
 *              conflicts.
 *
 * Returns TRUE when a potential conflict is found, FALSE otherwise.
 **********************************************************************/
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
  const char *word;
  const char *lengths;
  inT16 word_len;                //its length
  inT16 first_alphanum_index_;
  inT16 first_alphanum_offset_;
  inT16 i;
  inT16 offset;
  BOOL8 non_conflict_set_char;   //non conf set a/n?
  BOOL8 conflict = FALSE;
  BOOL8 allow_1s;
  ACCEPTABLE_WERD_TYPE word_type;
  BOOL8 dict_perm_type;
  BOOL8 dict_word_ok;
  int dict_word_type;

  // word/lengths point at the best choice's string data. The flips below
  // write through unichar_string()[...] - presumably the same buffer, so
  // they are visible through `word` as well (TODO confirm).
  word = word_res->best_choice->unichar_string().string ();
  lengths = word_res->best_choice->unichar_lengths().string();
  word_len = strlen (lengths);   // one lengths entry per unichar
  /*
    If there are no occurrences of the conflict set characters then the word
    is OK.
  */
  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
    return FALSE;

  /*
    There is a conflict if there are NO other (confirmed) alphanumerics apart
    from those in the conflict set.
  */

  // i indexes unichars, offset is the byte position of unichar i. The scan
  // stops at the first alphanumeric that is outside the conflict set.
  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
    non_conflict_set_char =
        (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
            word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
        !STRING (conflict_set_I_l_1).contains (word[offset]);
  if (!non_conflict_set_char) {
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }

  /*
    If the word is accepted by a dawg permuter, and the first alpha character
    is "I" or "l", check to see if the alternative is also a dawg word. If it
    is, then there is a potential error otherwise the word is ok.
  */

  // dict_perm_type: the permuter that produced the word is a dawg one
  // (DOC_DAWG_PERM only counts when rej_trust_doc_dawg is set).
  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
    (rej_trust_doc_dawg &&
    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
  // dict_word_ok: a direct dictionary lookup accepts the word (again with
  // DOC_DAWG_PERM subject to rej_trust_doc_dawg).
  dict_word_type = dict_word(*(word_res->best_choice));
  dict_word_ok = (dict_word_type > 0) &&
    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
    (rej_1Il_trust_permuter_type && dict_perm_type) ||
  (dict_perm_type && dict_word_ok)) {
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    // Leading single-byte 'I': try 'l' instead. The original character is
    // restored on both outcomes.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'I') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        return FALSE;
      }
    }

    // Leading single-byte 'l': the mirror-image check, trying 'I'.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'l') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        return FALSE;
      }
    }
    return FALSE;
  }

  /*
    NEW 1Il code. The old code relied on permuter types too much. In fact,
    tess will use TOP_CHOICE permute for good things like "palette".
    In this code the string is examined independently to see if it looks like
    a well formed word.
  */

  /*
    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
    dictionary word.
  */
  // NOTE(review): unlike the branch above, when the flipped spelling is a
  // dictionary word this returns FALSE WITHOUT restoring the original
  // character, so best_choice keeps the flipped letter. Confirm this is
  // intended.
  first_alphanum_index_ = first_alphanum_index (word, lengths);
  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
  if (lengths[first_alphanum_index_] == 1 &&
      word[first_alphanum_offset_] == 'l') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
  }
  else if (lengths[first_alphanum_index_] == 1 &&
           word[first_alphanum_offset_] == 'I') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
  }
  /*
    For strings containing digits:
      If there are no alphas OR the numeric permuter liked the word,
        reject any non 1 conflict chs
      Else reject all conflict chs
  */
  if (word_contains_non_1_digit (word, lengths)) {
    allow_1s = (alpha_count (word, lengths) == 0) ||
      (word_res->best_choice->permuter () == NUMBER_PERM);

    // This declaration shadows the function-level `offset`.
    inT16 offset;
    conflict = FALSE;
    for (i = 0, offset = 0; word[offset] != '\0';
         offset += word_res->best_choice->unichar_lengths()[i++]) {
      if ((!allow_1s || (word[offset] != '1')) &&
      STRING (conflict_set_I_l_1).contains (word[offset])) {
        if (update_map)
          word_res->reject_map[i].setrej_1Il_conflict ();
        conflict = TRUE;
      }
    }
    return conflict;
  }
  /*
    For anything else. See if it conforms to an acceptable word type. If so,
    treat accordingly.
  */
  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Only the first alphanumeric is checked for these word types.
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
      if (update_map)
        word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict ();
      return TRUE;
    }
    else
      return FALSE;
  }
  else if (word_type == AC_UPPER_CASE) {
    return FALSE;
  }
  else {
    // Any other word type: reject all conflict characters.
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }
}