Пример #1
0
/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the result is acceptable, then
 * builds the reject map for the word.
 *
 * The rejection heuristic is selected by tessedit_reject_mode:
 *   0 - Ray's original heuristic (the baseline).
 *   5 - Reject I/1/l from words where there is no strong contextual
 *       confirmation; the whole of any unacceptable words (incl PERM rej of
 *       dubious 1/I/ls); and the whole of any words which are very small.
 * Any other value prints an error and calls err_exit().
 *
 * word: the word result whose reject_map is (re)initialised and filled in.
 * row:  not referenced by this function.
 * pass: forwarded to set_done().
 *************************************************************************/
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
  flip_0O(word);
  check_debug_pt(word, -1);  // Debug trap only.
  set_done(word, pass);      // Decide acceptance (sets the done flag).
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  if (tessedit_reject_mode == 0) {
    // Mode 0: only words that are not already done get the poor-match test.
    if (!word->done) {
      reject_poor_matches(word);
    }
  } else if (tessedit_reject_mode == 5) {
    const bool xheight_too_small =
        kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels;
    if (xheight_too_small) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, TRUE);

      // The conditions that feed the done flag are repeated here, unpacked,
      // so that each mechanism can be switched on or off independently.
      // None of this touches the done flag itself.
      if (rej_use_tess_accepted && !word->tess_accepted) {
        word->reject_map.rej_word_not_tess_accepted();
      }

      if (rej_use_tess_blanks &&
          strchr(word->best_choice->unichar_string().string(), ' ') != NULL) {
        word->reject_map.rej_word_contains_blanks();
      }

      WERD_CHOICE *choice = word->best_choice;
      if (rej_use_good_perm) {
        const int perm = choice->permuter();
        const bool dawg_perm = perm == SYSTEM_DAWG_PERM ||
                               perm == FREQ_DAWG_PERM ||
                               perm == USER_DAWG_PERM;
        // A dawg permuter passes outright unless the "sensible word" check is
        // enabled, in which case the string must also look acceptable.
        const bool passed =
            dawg_perm &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(*word->uch_set,
                                    choice->unichar_string().string(),
                                    choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE);

        if (!passed) {
          if (perm == NUMBER_PERM) {
            if (rej_alphas_in_number_perm) {
              // Walk the characters of the word; byte_pos advances by each
              // character's length so it always addresses the start of
              // character number ch.
              int ch = 0;
              int byte_pos = 0;
              while (choice->unichar_string()[byte_pos] != '\0') {
                const int ch_len = choice->unichar_lengths()[ch];
                if (word->reject_map[ch].accepted() &&
                    word->uch_set->get_isalpha(
                        choice->unichar_string().string() + byte_pos,
                        ch_len)) {
                  // Alphabetic character inside a number: reject it.
                  word->reject_map[ch].setrej_bad_permuter();
                }
                byte_pos += ch_len;
                ++ch;
              }
            }
          } else {
            word->reject_map.rej_word_bad_permuter();
          }
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    err_exit();
  }

  if (tessedit_image_border > -1) {
    reject_edge_blobs(word);
  }

  check_debug_pt(word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
    tprintf("Certainty: %f     Rating: %f\n",
            word->best_choice->certainty(), word->best_choice->rating());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
Пример #2
0
/*************************************************************************
 * set_unlv_suspects()
 *
 * Relaxes the reject map of a word according to suspect_level by marking
 * selected rejected characters with setrej_minimal_rej_accept().
 *
 * SUSPECT LEVELS
 *
 * 0 - don't reject ANYTHING (every rejected character is re-accepted)
 * 1,2 - partial rejection (level 2 stops after the doc/block/row step,
 *       level 1 also applies the 1/I/l and acceptable-string steps)
 * 3 - BEST (3 or above: the reject map is left as it is)
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
 *************************************************************************/
    void Tesseract::set_unlv_suspects(WERD_RES * word_res) {
        const int map_len = word_res->reject_map.length();
        const WERD_CHOICE &choice = *(word_res->best_choice);
        const UNICHARSET &charset = *choice.unicharset();

        if (suspect_level == 0) {
            // Level 0: re-accept everything that was rejected.
            for (int pos = 0; pos < map_len; ++pos) {
                if (!word_res->reject_map[pos].rejected())
                    continue;
                word_res->reject_map[pos].setrej_minimal_rej_accept();
            }
            return;
        }

        if (suspect_level >= 3) {
            // Use the defaults: nothing is un-rejected.
            return;
        }

        /* Levels 1 and 2 from here on: look for things to un-reject. */

        const bool long_dict_word =
            safe_dict_word(word_res) &&
            (count_alphas(choice) > suspect_short_words);
        if (long_dict_word) {
            /* Un-reject the alphabetic characters of dictionary words. */
            for (int pos = 0; pos < map_len; ++pos) {
                if (!word_res->reject_map[pos].rejected())
                    continue;
                if (charset.get_isalpha(choice.unichar_id(pos)))
                    word_res->reject_map[pos].setrej_minimal_rej_accept();
            }
        }

        const float rating_per_ch =
            choice.rating() / word_res->reject_map.length();

        if (rating_per_ch >= suspect_rating_per_ch) {
            // Badly rated words are left alone.
            return;
        }

        if ((word_res->tess_accepted) ||
            (rating_per_ch < suspect_accept_rating)) {
            /* Un-reject any Tess-acceptable word, except for characters that
               are a blank (" "), i.e. the ones Tess itself rejected. */
            for (int pos = 0; pos < map_len; ++pos) {
                if (!word_res->reject_map[pos].rejected())
                    continue;
                if (!charset.eq(choice.unichar_id(pos), " "))
                    word_res->reject_map[pos].setrej_minimal_rej_accept();
            }
        }

        // Un-reject characters that carry a document, block or row level
        // rejection flag. Each flag is tested on its own, one after another.
        for (int pos = 0; pos < map_len; ++pos) {
            if (!word_res->reject_map[pos].rejected())
                continue;
            if (word_res->reject_map[pos].flag(R_DOC_REJ))
                word_res->reject_map[pos].setrej_minimal_rej_accept();
            if (word_res->reject_map[pos].flag(R_BLOCK_REJ))
                word_res->reject_map[pos].setrej_minimal_rej_accept();
            if (word_res->reject_map[pos].flag(R_ROW_REJ))
                word_res->reject_map[pos].setrej_minimal_rej_accept();
        }

        if (suspect_level == 2) {
            // Level 2 stops here; only level 1 continues.
            return;
        }

        if (!suspect_constrain_1Il ||
            (word_res->reject_map.length() <= suspect_short_words)) {
            // Un-reject 1/I/l conflicts, and matrix-match rejects too when
            // the 1/I/l constraint is switched off.
            for (int pos = 0; pos < map_len; ++pos) {
                if (!word_res->reject_map[pos].rejected())
                    continue;

                if (word_res->reject_map[pos].flag(R_1IL_CONFLICT) ||
                    word_res->reject_map[pos].flag(R_POSTNN_1IL))
                    word_res->reject_map[pos].setrej_minimal_rej_accept();

                if (!suspect_constrain_1Il &&
                    word_res->reject_map[pos].flag(R_MM_REJECT))
                    word_res->reject_map[pos].setrej_minimal_rej_accept();
            }
        }

        // Strings that look like an acceptable word or number and are longer
        // than suspect_short_words get their remaining soft rejections (and
        // any 1/I/l or matrix-match rejections) lifted.
        const bool plausible_string =
            acceptable_word_string(*word_res->uch_set,
                                   choice.unichar_string().string(),
                                   choice.unichar_lengths().string()) !=
                AC_UNACCEPTABLE ||
            acceptable_number_string(choice.unichar_string().string(),
                                     choice.unichar_lengths().string());
        if (plausible_string &&
            word_res->reject_map.length() > suspect_short_words) {
            for (int pos = 0; pos < map_len; ++pos) {
                if (!word_res->reject_map[pos].rejected())
                    continue;
                if (!word_res->reject_map[pos].perm_rejected() ||
                    word_res->reject_map[pos].flag(R_1IL_CONFLICT) ||
                    word_res->reject_map[pos].flag(R_POSTNN_1IL) ||
                    word_res->reject_map[pos].flag(R_MM_REJECT)) {
                    word_res->reject_map[pos].setrej_minimal_rej_accept();
                }
            }
        }
    }
Пример #3
0
/**********************************************************************
 * one_ell_conflict()
 *
 * Identify words where there is a potential I/l/1 error.
 * - A bundle of contextual heuristics!
 *
 * word_res:   the word to examine. The text of word_res->best_choice is
 *             edited in place while alternative spellings are tested.
 * update_map: when TRUE, conflicting characters are also flagged in
 *             word_res->reject_map (directly, or by calling
 *             reject_I_1_L()); when FALSE only the verdict is returned.
 *
 * Returns TRUE when a conflict is detected, FALSE otherwise.
 **********************************************************************/
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
  const char *word;              // text of the best choice
  const char *lengths;           // per-character byte lengths within word
  inT16 word_len;                // number of entries in lengths
  inT16 first_alphanum_index_;   // index (into lengths / reject_map)
  inT16 first_alphanum_offset_;  // byte offset (into word)
  inT16 i;
  inT16 offset;
  BOOL8 non_conflict_set_char;   // found an alphanumeric outside the I/l/1 set?
  BOOL8 conflict = FALSE;
  BOOL8 allow_1s;
  ACCEPTABLE_WERD_TYPE word_type;
  BOOL8 dict_perm_type;
  BOOL8 dict_word_ok;
  int dict_word_type;

  // NOTE(review): word presumably points into the same storage that is later
  // modified through best_choice->unichar_string()[...] - confirm.
  word = word_res->best_choice->unichar_string().string ();
  lengths = word_res->best_choice->unichar_lengths().string();
  word_len = strlen (lengths);
  /*
    If there are no occurrences of the conflict set characters then the word
    is OK.
  */
  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
    return FALSE;

  /*
    There is a conflict if there are NO other (confirmed) alphanumerics apart
    from those in the conflict set.
  */

  // Scan character by character (offset advances by each character's length)
  // and stop at the first alpha/digit whose first byte is not in the
  // conflict set.
  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
    non_conflict_set_char =
        (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
            word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
        !STRING (conflict_set_I_l_1).contains (word[offset]);
  if (!non_conflict_set_char) {
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }

  /*
    If the word is accepted by a dawg permuter, and the first alpha character
    is "I" or "l", check to see if the alternative is also a dawg word. If it
    is, then there is a potential error otherwise the word is ok.
  */

  // DOC_DAWG_PERM only counts as a dictionary permuter / dictionary word
  // when rej_trust_doc_dawg is set.
  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
    (rej_trust_doc_dawg &&
    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
  dict_word_type = dict_word(*(word_res->best_choice));
  dict_word_ok = (dict_word_type > 0) &&
    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
    (rej_1Il_trust_permuter_type && dict_perm_type) ||
  (dict_perm_type && dict_word_ok)) {
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    // Leading single-byte 'I': substitute 'l', test the alternative, and
    // put the 'I' back on both outcomes.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'I') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        return FALSE;
      }
    }

    // Leading single-byte 'l': the mirror image of the case above.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'l') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        return FALSE;
      }
    }
    // Trusted dictionary word whose first alphanumeric is neither 'I' nor
    // 'l': no conflict.
    return FALSE;
  }

  /*
    NEW 1Il code. The old code relied on permuter types too much. In fact,
    tess will use TOP_CHOICE permute for good things like "palette".
    In this code the string is examined independently to see if it looks like
    a well formed word.
  */

  /*
    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
    dictionary word.
  */
  // NOTE(review): unlike the dictionary branch above, when the flipped
  // spelling is a dictionary word the function returns FALSE without
  // restoring the original character, so best_choice keeps the flipped
  // letter - confirm this is intended.
  first_alphanum_index_ = first_alphanum_index (word, lengths);
  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
  if (lengths[first_alphanum_index_] == 1 &&
      word[first_alphanum_offset_] == 'l') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
  }
  else if (lengths[first_alphanum_index_] == 1 &&
           word[first_alphanum_offset_] == 'I') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
  }
  /*
    For strings containing digits:
      If there are no alphas OR the numeric permuter liked the word,
        reject any non 1 conflict chs
      Else reject all conflict chs
  */
  if (word_contains_non_1_digit (word, lengths)) {
    allow_1s = (alpha_count (word, lengths) == 0) ||
      (word_res->best_choice->permuter () == NUMBER_PERM);

    // NOTE: this declaration shadows the function-level `offset`.
    inT16 offset;
    conflict = FALSE;
    // Flag every character whose first byte is in the conflict set, except
    // for '1' when 1s are allowed.
    for (i = 0, offset = 0; word[offset] != '\0';
         offset += word_res->best_choice->unichar_lengths()[i++]) {
      if ((!allow_1s || (word[offset] != '1')) &&
      STRING (conflict_set_I_l_1).contains (word[offset])) {
        if (update_map)
          word_res->reject_map[i].setrej_1Il_conflict ();
        conflict = TRUE;
      }
    }
    return conflict;
  }
  /*
    For anything else. See if it conforms to an acceptable word type. If so,
    treat accordingly.
  */
  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Lower-case or initial-cap word: only the first alphanumeric character
    // is checked against the conflict set.
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
      if (update_map)
        word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict ();
      return TRUE;
    }
    else
      return FALSE;
  }
  else if (word_type == AC_UPPER_CASE) {
    // Upper-case words are reported as conflict free.
    return FALSE;
  }
  else {
    // Any other word type: hand the word to reject_I_1_L() (when updating
    // the map) and report a conflict.
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }
}