Exemple #1
0
/**********************************************************************
 * Tesseract::recog_word
 *
 * Run the recursive recognizer on one word, sanity-check the result
 * against the box word and the ratings matrix, optionally override the
 * permuter type from a straight dictionary check, and finally record in
 * word->tess_failed whether a usable result was produced.
 **********************************************************************/
void Tesseract::recog_word(WERD_RES *word) {
  // When configured to skip words without truth, flag the word as failed
  // and do no recognition at all.
  if (wordrec_skip_no_truth_words) {
    const bool truth_missing =
        word->blamer_bundle == NULL ||
        word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH;
    if (truth_missing) {
      if (classify_debug_level) {
        tprintf("No truth for word - skipping\n");
      }
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // The best choice and the box word must have the same length; report the
  // details before the assertion below fires.
  const bool lengths_agree =
      word->best_choice->length() == word->box_word->length();
  if (!lengths_agree) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; "
            "Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(), word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The ratings matrix size has to match the sum of all the segmentation
  // states; dump the word choices before asserting if it does not.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // A choice that did not come from one of the dawg permuters takes the
    // permuter type of a straight dictionary check, provided that check
    // reports a dawg type and the string contains at least one alpha.
    const uinT8 perm_before = word->best_choice->permuter();
    const bool came_from_dawg = (perm_before == SYSTEM_DAWG_PERM) ||
                                (perm_before == FREQ_DAWG_PERM) ||
                                (perm_before == USER_DAWG_PERM);
    if (!came_from_dawg) {
      const uinT8 dict_perm = dict_word(*word->best_choice);
      const bool dict_says_dawg = (dict_perm == SYSTEM_DAWG_PERM) ||
                                  (dict_perm == FREQ_DAWG_PERM) ||
                                  (dict_perm == USER_DAWG_PERM);
      if (dict_says_dawg &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug &&
        perm_before != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              perm_before, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));

  // Recognition counts as failed when there is no best choice, it is empty,
  // or its leading run of spaces is as long as the choice itself.
  bool recognition_failed =
      word->best_choice == NULL || word->best_choice->length() == 0;
  if (!recognition_failed) {
    const int leading_spaces = static_cast<int>(
        strspn(word->best_choice->unichar_string().string(), " "));
    recognition_failed = (leading_spaces == word->best_choice->length());
  }

  word->tess_failed = recognition_failed;
  if (recognition_failed) {
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  }
}
Exemple #2
0
/**********************************************************************
 * recog_word
 *
 * Recognize one word. A word with no blobs gets empty word and raw
 * choices and a poly copy as its output word; any other word is handed to
 * recog_word_recursive. The result must have one length entry per output
 * blob and per blob-choice list. Reject blobs are then copied into the
 * output word and, if enabled, the permuter type is overridden by a
 * straight dictionary check. Returns the chosen WERD_CHOICE.
 **********************************************************************/
WERD_CHOICE *recog_word(                           //recog one word
                        WERD *word,                //word to do
                        DENORM *denorm,            //de-normaliser
                        POLY_MATCHER matcher,      //matcher function
                        POLY_TESTER tester,        //tester function
                        POLY_TESTER trainer,       //trainer function
                        BOOL8 testing,             //true if answer driven
                        WERD_CHOICE *&raw_choice,  //raw result
                        //list of blob lists
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword             //bln word output
                       ) {
  WERD_CHOICE *word_choice;

  if (!word->blob_list()->empty()) {
    // Normal case: let the recursive recognizer do the work.
    word_choice = recog_word_recursive(word, denorm, matcher, tester,
                                       trainer, testing, raw_choice,
                                       blob_choices, outword);
  } else {
    // Nothing to recognize: build empty choices and copy the word through.
    char no_lengths[] = {0};
    word_choice = new WERD_CHOICE("", no_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE("", no_lengths,
                                 10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy(denorm->row()->x_height());
  }

  // Report the three counts before the assertions below fire on a mismatch.
  const bool counts_agree =
      (word_choice->lengths().length() == outword->blob_list()->length()) &&
      (word_choice->lengths().length() == blob_choices->length());
  if (!counts_agree) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
            word_choice->string().string(),
            word_choice->lengths().length(),
            outword->blob_list()->length(),
            blob_choices->length());
  }
  ASSERT_HOST (word_choice->lengths ().length () ==
    outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  /* Copy any reject blobs into the outword */
  outword->rej_blob_list()->deep_copy(word->rej_blob_list());

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    const uinT8 perm_before = word_choice->permuter();
    const bool came_from_dawg = (perm_before == SYSTEM_DAWG_PERM) ||
                                (perm_before == FREQ_DAWG_PERM) ||
                                (perm_before == USER_DAWG_PERM);
    if (!came_from_dawg) {
      const uinT8 dict_perm = dict_word(word_choice->string().string());
      const bool dict_says_dawg = (dict_perm == SYSTEM_DAWG_PERM) ||
                                  (dict_perm == FREQ_DAWG_PERM) ||
                                  (dict_perm == USER_DAWG_PERM);
      // Only adopt the dictionary's permuter for strings with an alpha.
      if (dict_says_dawg &&
          alpha_count(word_choice->string().string(),
                      word_choice->lengths().string()) > 0) {
        word_choice->set_permuter(dict_perm);
      }
    }
    if (tessedit_rejection_debug && perm_before != word_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              perm_before, word_choice->permuter());
    }
  }
  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
Exemple #3
0
/**********************************************************************
 * one_ell_conflict()
 *
 * Identify words where there is a potential I/l/1 error.
 * - A bundle of contextual heuristics!
 *
 * word_res:   word whose best_choice string is examined.
 * update_map: when TRUE, conflicting positions are also flagged in
 *             word_res->reject_map, or reject_I_1_L(word_res) is called.
 * Returns TRUE if a conflict is found, FALSE otherwise.
 *
 * The heuristics are tried in this order, each returning as soon as it
 * reaches a verdict:
 *   1. No conflict-set character in the word          -> FALSE.
 *   2. No alphanumeric outside the conflict set       -> TRUE.
 *   3. Dictionary/permuter-backed word: flip a leading I/l and see
 *      whether the alternative is also a dictionary word.
 *   4. Regardless of permuter: flip a leading I/l and accept the word if
 *      the flipped form is a dictionary word.
 *   5. Word containing a digit other than 1: flag conflict characters.
 *   6. Otherwise decide from the acceptable-word type of the string.
 *
 * NOTE(review): the leading I/l character of best_choice's unichar string
 * is rewritten in place while alternatives are tested. In step 4 the two
 * "return FALSE" paths leave the flipped character in place - confirm
 * that this is intended.
 **********************************************************************/
BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
  const char *word;
  const char *lengths;
  inT16 word_len;                //its length
  inT16 first_alphanum_index_;
  inT16 first_alphanum_offset_;
  inT16 i;
  inT16 offset;
  BOOL8 non_conflict_set_char;   //non conf set a/n?
  BOOL8 conflict = FALSE;
  BOOL8 allow_1s;
  ACCEPTABLE_WERD_TYPE word_type;
  BOOL8 dict_perm_type;
  BOOL8 dict_word_ok;
  int dict_word_type;

  // word is the best choice's string; lengths[i] is used below as the
  // number of bytes of word occupied by character i.
  word = word_res->best_choice->unichar_string().string ();
  lengths = word_res->best_choice->unichar_lengths().string();
  // Number of entries in lengths, i.e. the number of characters stepped
  // over by the loops below.
  word_len = strlen (lengths);
  /*
    If there are no occurrences of the conflict set characters then the word
    is OK.
  */
  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
    return FALSE;

  /*
    There is a conflict if there are NO other (confirmed) alphanumerics apart
    from those in the conflict set.
  */

  // Scan character by character (i indexes characters, offset indexes
  // bytes) and stop at the first alpha or digit whose first byte is not in
  // the conflict set.
  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
    non_conflict_set_char =
        (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
            word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
        !STRING (conflict_set_I_l_1).contains (word[offset]);
  if (!non_conflict_set_char) {
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }

  /*
    If the word is accepted by a dawg permuter, and the first alpha character
    is "I" or "l", check to see if the alternative is also a dawg word. If it
    is, then there is a potential error otherwise the word is ok.
  */

  // dict_perm_type: the best choice's permuter is one of the dawg types
  // (the document dawg only counts when rej_trust_doc_dawg is set).
  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
    (rej_trust_doc_dawg &&
    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
  // dict_word_ok: an independent dictionary lookup returns a positive type,
  // again discounting the document dawg unless it is trusted.
  dict_word_type = dict_word(*(word_res->best_choice));
  dict_word_ok = (dict_word_type > 0) &&
    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  if ((rej_1Il_use_dict_word && dict_word_ok) ||
    (rej_1Il_trust_permuter_type && dict_perm_type) ||
  (dict_perm_type && dict_word_ok)) {
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    // Leading single-byte 'I': temporarily write 'l' in its place, test the
    // result with safe_dict_word, then restore 'I' on both outcomes.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'I') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        return FALSE;
      }
    }

    // Leading single-byte 'l': the mirror image of the case above.
    if (lengths[first_alphanum_index_] == 1 &&
        word[first_alphanum_offset_] == 'l') {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        if (update_map)
          word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict();
        return TRUE;
      }
      else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        return FALSE;
      }
    }
    // First alphanumeric is neither 'I' nor 'l': no conflict reported.
    return FALSE;
  }

  /*
    NEW 1Il code. The old code relied on permuter types too much. In fact,
    tess will use TOP_CHOICE permute for good things like "palette".
    In this code the string is examined independently to see if it looks like
    a well formed word.
  */

  /*
    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
    dictionary word.
  */
  first_alphanum_index_ = first_alphanum_index (word, lengths);
  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
  // NOTE(review): on the two "return FALSE" paths below the flipped
  // character is left in the string; only the failing lookup restores the
  // original character. Confirm this is intended.
  if (lengths[first_alphanum_index_] == 1 &&
      word[first_alphanum_offset_] == 'l') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
  }
  else if (lengths[first_alphanum_index_] == 1 &&
           word[first_alphanum_offset_] == 'I') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    if (safe_dict_word(word_res) > 0)
      return FALSE;
    else
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
  }
  /*
    For strings containing digits:
      If there are no alphas OR the numeric permuter liked the word,
        reject any non 1 conflict chs
      Else reject all conflict chs
  */
  if (word_contains_non_1_digit (word, lengths)) {
    allow_1s = (alpha_count (word, lengths) == 0) ||
      (word_res->best_choice->permuter () == NUMBER_PERM);

    // This offset shadows the function-scope offset declared above.
    inT16 offset;
    conflict = FALSE;
    // Walk every character; i is the character index used for the reject
    // map and offset is the byte position of that character in word.
    for (i = 0, offset = 0; word[offset] != '\0';
         offset += word_res->best_choice->unichar_lengths()[i++]) {
      if ((!allow_1s || (word[offset] != '1')) &&
      STRING (conflict_set_I_l_1).contains (word[offset])) {
        if (update_map)
          word_res->reject_map[i].setrej_1Il_conflict ();
        conflict = TRUE;
      }
    }
    return conflict;
  }
  /*
    For anything else. See if it conforms to an acceptable word type. If so,
    treat accordingly.
  */
  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Lower-case or initial-cap word: only the first alphanumeric matters.
    first_alphanum_index_ = first_alphanum_index (word, lengths);
    first_alphanum_offset_ = first_alphanum_offset (word, lengths);
    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
      if (update_map)
        word_res->reject_map[first_alphanum_index_].
            setrej_1Il_conflict ();
      return TRUE;
    }
    else
      return FALSE;
  }
  else if (word_type == AC_UPPER_CASE) {
    // Upper-case word: no conflict reported.
    return FALSE;
  }
  else {
    // Not an acceptable word type: treat the whole word as a conflict.
    if (update_map)
      reject_I_1_L(word_res);
    return TRUE;
  }
}