예제 #1
0
/**********************************************************************
 * split_and_recog_word
 *
 * Finds the widest horizontal gap between two neighbouring blobs of the
 * chopped word, cuts the word in two at that gap, runs recognition on
 * each half and finally joins the two results back together in `word`.
 *
 * The word must contain at least two blobs; otherwise no split point is
 * found and the ASSERT_HOST below fires.
 **********************************************************************/
void Tesseract::split_and_recog_word(WERD_RES *word) {
  // split_index ends up as the index of the blob that starts right after
  // the widest gap. The strict '>' keeps the first of several equal gaps.
  int split_index = 0;
  int widest_gap = -MAX_INT32;
  const int blob_count = word->chopped_word->NumBlobs();
  for (int right = 1; right < blob_count; ++right) {
    const int left_neighbour_end =
        word->chopped_word->blobs[right - 1]->bounding_box().right();
    const int this_blob_start =
        word->chopped_word->blobs[right]->bounding_box().left();
    const int distance = this_blob_start - left_neighbour_end;
    if (distance > widest_gap) {
      split_index = right;
      widest_gap = distance;
    }
  }
  ASSERT_HOST(split_index > 0);

  // Cut the word: `word` keeps the first part, `second_half` receives the
  // rest, and `saved_bundle` is whatever split_word returns for the later
  // join_words call.
  WERD_RES *second_half = NULL;
  BlamerBundle *saved_bundle = NULL;
  split_word(word, split_index, &second_half, &saved_bundle);

  // Recognize both parts independently, first part first.
  recog_word_recursive(word);
  recog_word_recursive(second_half);

  // Merge the second part's result back into `word`.
  join_words(word, second_half, saved_bundle);
}
예제 #2
0
/**
 * Recognizes a single word and post-processes the result in place.
 *
 * Steps, in order:
 *  - If wordrec_skip_no_truth_words is set and the word has no blamer bundle
 *    (or its incorrect-result reason is IRR_NO_TRUTH), mark the word as
 *    failed and return without recognizing anything.
 *  - Run recog_word_recursive() and build the box word.
 *  - Assert that best_choice and box_word have the same length and that all
 *    segmentation states are valid.
 *  - If tessedit_override_permuter is set, replace a non-dictionary permuter
 *    with the one reported by dict_word() when that is a dawg permuter and
 *    the choice contains at least one alpha character.
 *  - Set word->tess_failed, and on failure initialise the reject map.
 *
 * @param word  the word to recognize; updated in place.
 */
void Tesseract::recog_word(WERD_RES *word) {
  if (wordrec_skip_no_truth_words) {
    const bool truth_missing =
        word->blamer_bundle == NULL ||
        word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH;
    if (truth_missing) {
      if (classify_debug_level) tprintf("No truth for word - skipping\n");
      word->tess_failed = true;
      return;
    }
  }

  ASSERT_HOST(!word->chopped_word->blobs.empty());
  recog_word_recursive(word);
  word->SetupBoxWord();

  // Report the details before the assertion below aborts on a mismatch.
  if (!(word->best_choice->length() == word->box_word->length())) {
    tprintf("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d\n",
            word->best_choice->debug_string().string(),
            word->best_choice->length(), word->box_word->length());
  }
  ASSERT_HOST(word->best_choice->length() == word->box_word->length());

  // The ratings matrix size has to agree with the sum of all the
  // segmentation states.
  if (!word->StatesAllValid()) {
    tprintf("Not all words have valid states relative to ratings matrix!!");
    word->DebugWordChoices(true, NULL);
    ASSERT_HOST(word->StatesAllValid());
  }

  if (tessedit_override_permuter) {
    // Override the permuter type if a straight dictionary check disagrees.
    const uinT8 initial_perm = word->best_choice->permuter();
    const bool already_dawg = initial_perm == SYSTEM_DAWG_PERM ||
                              initial_perm == FREQ_DAWG_PERM ||
                              initial_perm == USER_DAWG_PERM;
    if (!already_dawg) {
      const uinT8 dict_perm = dict_word(*word->best_choice);
      const bool dict_is_dawg = dict_perm == SYSTEM_DAWG_PERM ||
                                dict_perm == FREQ_DAWG_PERM ||
                                dict_perm == USER_DAWG_PERM;
      // alpha_count is only consulted once the dictionary check succeeded.
      if (dict_is_dawg &&
          alpha_count(word->best_choice->unichar_string().string(),
                      word->best_choice->unichar_lengths().string()) > 0) {
        word->best_choice->set_permuter(dict_perm);  // use dict perm
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word->best_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              initial_perm, word->best_choice->permuter());
    }
  }

  // Factored out from control.cpp
  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));

  // The word counts as failed when there is no choice, the choice is empty,
  // or the choice starts with as many spaces as it has characters.
  bool failed = word->best_choice == NULL || word->best_choice->length() == 0;
  if (!failed) {
    const int leading_spaces = static_cast<int>(
        strspn(word->best_choice->unichar_string().string(), " "));
    failed = (leading_spaces == word->best_choice->length());
  }

  word->tess_failed = failed;
  if (failed) {
    word->reject_map.initialise(word->box_word->length());
    word->reject_map.rej_word_tess_failure();
  }
}
예제 #3
0
파일: tfacepp.cpp 프로젝트: chanchai/botker
/**********************************************************************
 * recog_word
 *
 * Recognizes one word by handing it to recog_word_recursive and returns
 * the resulting word choice. A word without blobs is not recognized;
 * empty choices and a poly copy of the word are produced instead.
 * Afterwards the result lengths are checked against the output word and
 * the blob choices, reject blobs are copied into the output word, and
 * (if tessedit_override_permuter is set) the permuter type may be
 * replaced by the one a plain dictionary lookup reports.
 **********************************************************************/
WERD_CHOICE *recog_word(                           //recog one word
                        WERD *word,                //word to do
                        DENORM *denorm,            //de-normaliser
                        POLY_MATCHER matcher,      //matcher function
                        POLY_TESTER tester,        //tester function
                        POLY_TESTER trainer,       //trainer function
                        BOOL8 testing,             //true if answer driven
                        WERD_CHOICE *&raw_choice,  //raw result //list of blob lists
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword             //bln word output
                       ) {
  WERD_CHOICE *word_choice;

  if (!word->blob_list ()->empty ()) {
    word_choice = recog_word_recursive (word, denorm, matcher, tester,
                                        trainer, testing, raw_choice,
                                        blob_choices, outword);
  }
  else {
    // Nothing to recognize: fabricate empty best/raw choices and use a
    // poly copy of the input as the output word.
    char empty_lengths[] = {0};
    word_choice = new WERD_CHOICE ("", empty_lengths,
                                   10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE ("", empty_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy (denorm->row ()->x_height ());
  }

  // Print the details before the assertions below abort on a mismatch.
  if (!((word_choice->lengths ().length () ==
         outword->blob_list ()->length ()) &&
        (word_choice->lengths ().length () == blob_choices->length ()))) {
    tprintf
      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
      word_choice->string ().string (), word_choice->lengths ().length (),
      outword->blob_list ()->length (), blob_choices->length ());
  }
  ASSERT_HOST (word_choice->lengths ().length () ==
    outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  // Copy any reject blobs into the outword.
  outword->rej_blob_list ()->deep_copy (word->rej_blob_list ());

  if (tessedit_override_permuter) {
    // Override the permuter type if a straight dictionary check disagrees.
    const uinT8 initial_perm = word_choice->permuter ();
    const bool already_dawg = (initial_perm == SYSTEM_DAWG_PERM) ||
                              (initial_perm == FREQ_DAWG_PERM) ||
                              (initial_perm == USER_DAWG_PERM);
    if (!already_dawg) {
      const uinT8 dict_perm = dict_word (word_choice->string ().string ());
      const bool dict_is_dawg = (dict_perm == SYSTEM_DAWG_PERM) ||
                                (dict_perm == FREQ_DAWG_PERM) ||
                                (dict_perm == USER_DAWG_PERM);
      // alpha_count is only consulted once the dictionary check succeeded.
      if (dict_is_dawg &&
          alpha_count (word_choice->string ().string (),
                       word_choice->lengths ().string ()) > 0) {
        word_choice->set_permuter (dict_perm);  // use dict perm
      }
    }
    if (tessedit_rejection_debug &&
        initial_perm != word_choice->permuter ()) {
      tprintf ("Permuter Type Flipped from %d to %d\n",
        initial_perm, word_choice->permuter ());
    }
  }

  // Best and raw choice must either both exist or both be absent.
  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
예제 #4
0
파일: tfacepp.cpp 프로젝트: chanchai/botker
/**********************************************************************
 * split_and_recog_word
 *
 * Cuts a poly copy of the word in two at the widest gap between
 * neighbouring blobs, recognizes both halves with recog_word_recursive
 * and merges the two results (best choice, raw choice and output word)
 * into the values returned for the first half.
 **********************************************************************/
WERD_CHOICE *split_and_recog_word(                           //recog one word
                                  WERD *word,                //word to do
                                  DENORM *denorm,            //de-normaliser
                                  POLY_MATCHER matcher,      //matcher function
                                  POLY_TESTER tester,        //tester function
                                  POLY_TESTER trainer,       //trainer function
                                  BOOL8 testing,             //true if answer driven
                                  WERD_CHOICE *&raw_choice,  //raw result //list of blob lists
                                  BLOB_CHOICE_LIST_CLIST *blob_choices,
                                  WERD *&outword             //bln word output
                                 ) {
  PBLOB_LIST new_blobs;                  // receives the blobs of the 2nd half
  PBLOB_IT new_blob_it (&new_blobs);     // start of the 2nd half once found

  // All splitting is done on a poly copy of the input word.
  WERD *first_word = word->poly_copy (denorm->row ()->x_height ());
  PBLOB_IT blob_it (first_word->blob_list ());

  // Walk neighbouring blob pairs and remember the iterator position just
  // after the widest gap. The strict '>' keeps the first of equal gaps.
  float bestgap = -MAX_INT32;
  for (; !blob_it.at_last (); ) {
    PBLOB *current = blob_it.data ();
    PBLOB *following = blob_it.data_relative (1);
    const float gap = following->bounding_box ().left () -
                      current->bounding_box ().right ();
    blob_it.forward ();
    if (gap > bestgap) {
      bestgap = gap;
      new_blob_it = blob_it;
    }
  }

  // Move everything from the saved position up to the last blob into
  // new_blobs and wrap it in a word of its own.
  new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
  WERD *second_word = new WERD (&new_blobs, 1, NULL);
  ASSERT_HOST (word->blob_list ()->length () ==
    first_word->blob_list ()->length () +
    second_word->blob_list ()->length ());

  // First half: results go straight into the caller's output arguments.
  WERD_CHOICE *result = recog_word_recursive (first_word, denorm, matcher,
                                              tester, trainer, testing,
                                              raw_choice, blob_choices,
                                              outword);
  delete first_word;

  // Second half: results land in temporaries that are merged below.
  WERD_CHOICE *raw_choice2;
  WERD *outword2;
  WERD_CHOICE *result2 = recog_word_recursive (second_word, denorm, matcher,
                                               tester, trainer, testing,
                                               raw_choice2, blob_choices,
                                               outword2);
  delete second_word;

  // Fold the second half into the first and release the temporaries.
  *result += *result2;
  delete result2;
  *raw_choice += *raw_choice2;
  delete raw_choice2;
  // A sanity check that the joined outword length equals the sum of both
  // parts used to live here; it was already disabled in the original.
  outword->join_on (outword2);
  delete outword2;
  return result;
}
예제 #5
0
/**
 * Try splitting off the given number of (chopped) blobs from the front and
 * back of the given word and recognizing the pieces.
 *
 * The word is copied and cut into up to three pieces (prefix, core, suffix).
 * Prefix and suffix are recognized with the class-pruner and integer-matcher
 * multipliers temporarily set to 0; the previous values are restored after
 * each piece. The core is only recognized if the prefix/suffix results are
 * believable or a retry hint was produced.
 *
 * @param[in]  num_chopped_leading   how many chopped blobs from the left
 *                    end of the word to chop off and try recognizing as a
 *                    superscript (or subscript)
 * @param[in]  leading_certainty     the (minimum) certainty had by the
 *                    characters in the original leading section.
 * @param[in]  leading_pos    "super" or "sub" (for debugging)
 * @param[in]  num_chopped_trailing  how many chopped blobs from the right
 *                    end of the word to chop off and try recognizing as a
 *                    superscript (or subscript)
 * @param[in]  trailing_certainty    the (minimum) certainty had by the
 *                    characters in the original trailing section.
 * @param[in]  trailing_pos      "super" or "sub" (for debugging)
 * @param[in]  word              the word to try to chop up.
 * @param[out] is_good           do we believe our result?
 * @param[out] retry_rebuild_leading, retry_rebuild_trailing
 *         If non-zero, and !is_good, then the caller may have luck trying
 *         to split the returned word with this number of (rebuilt) leading
 *         and trailing blobs / unichars.
 * @return A word which is the result of re-recognizing as asked, or NULL
 *         when the result is not good and neither retry count was set.
 *         The returned word is allocated with new inside this function.
 */
WERD_RES *Tesseract::TrySuperscriptSplits(
    int num_chopped_leading, float leading_certainty, ScriptPos leading_pos,
    int num_chopped_trailing, float trailing_certainty,
    ScriptPos trailing_pos,
    WERD_RES *word,
    bool *is_good,
    int *retry_rebuild_leading, int *retry_rebuild_trailing) {
  int num_chopped = word->chopped_word->NumBlobs();

  *retry_rebuild_leading = *retry_rebuild_trailing = 0;

  // Chop apart the word into up to three pieces.

  // bb0/bb1 receive the blamer bundles handed back by split_word; they are
  // passed to join_words further down.
  // NOTE(review): assumed to be owned by this function until join_words
  // takes them over - confirm against split_word/join_words.
  BlamerBundle *bb0 = NULL;
  BlamerBundle *bb1 = NULL;
  WERD_RES *prefix = NULL;
  WERD_RES *core = NULL;
  WERD_RES *suffix = NULL;
  if (num_chopped_leading > 0) {
    prefix = new WERD_RES(*word);
    split_word(prefix, num_chopped_leading, &core, &bb0);
  } else {
    core = new WERD_RES(*word);
  }

  if (num_chopped_trailing > 0) {
    // The split point is relative to core, which no longer contains the
    // leading blobs.
    int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
    split_word(core, split_pt, &suffix, &bb1);
  }

  //  Recognize the pieces in turn.
  int saved_cp_multiplier = classify_class_pruner_multiplier;
  int saved_im_multiplier = classify_integer_matcher_multiplier;
  if (prefix) {
    // Turn off Tesseract's y-position penalties for the leading superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
    }
    recog_word_recursive(prefix);
    if (superscript_debug >= 2) {
      tprintf(" The leading bits look like %s %s\n",
              ScriptPosToString(leading_pos),
              prefix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Debug output only; the core itself is recognized further down, after
  // the prefix/suffix results have been evaluated.
  if (superscript_debug >= 3) {
    tprintf(" recognizing middle %d chopped blobs\n",
            num_chopped - num_chopped_leading - num_chopped_trailing);
  }

  if (suffix) {
    // Turn off Tesseract's y-position penalties for the trailing superscript.
    classify_class_pruner_multiplier.set_value(0);
    classify_integer_matcher_multiplier.set_value(0);

    if (superscript_debug >= 3) {
      tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
    }
    recog_word_recursive(suffix);
    if (superscript_debug >= 2) {
      tprintf(" The trailing bits look like %s %s\n",
              ScriptPosToString(trailing_pos),
              suffix->best_choice->unichar_string().string());
    }

    // Restore the normal y-position penalties.
    classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
    classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
  }

  // Evaluate whether we think the results are believably better
  // than what we already had. A missing piece counts as good.
  bool good_prefix = !prefix || BelievableSuperscript(
      superscript_debug >= 1, *prefix,
      superscript_bettered_certainty * leading_certainty,
      retry_rebuild_leading, NULL);
  bool good_suffix = !suffix || BelievableSuperscript(
      superscript_debug >= 1, *suffix,
      superscript_bettered_certainty * trailing_certainty,
      NULL, retry_rebuild_trailing);

  *is_good = good_prefix && good_suffix;
  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
    // None of it is any good. Quit now.
    delete core;
    delete prefix;
    delete suffix;
    // join_words is never reached on this path, so the bundles returned by
    // split_word would otherwise leak. Deleting a NULL pointer is a no-op.
    delete bb0;
    delete bb1;
    return NULL;
  }
  recog_word_recursive(core);

  // Now paste the results together into core.
  if (suffix) {
    suffix->SetAllScriptPositions(trailing_pos);
    join_words(core, suffix, bb1);
  }
  if (prefix) {
    prefix->SetAllScriptPositions(leading_pos);
    join_words(prefix, core, bb0);
    // The joined result now lives in prefix; return it under the name core.
    core = prefix;
    prefix = NULL;
  }

  if (superscript_debug >= 1) {
    tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
            core->best_choice->unichar_string().string());
  }
  return core;
}