// Scores the spacing of the words in word_res_list for fixed-pitch text.
//
// When fixsp_fp_eval is off the work is delegated to eval_word_spacing().
// Otherwise only words that are done, tess-accepted, chosen by one of the
// system/frequency/user dawg permuters, or for which safe_dict_word() is
// positive contribute.  For each character position of such a word the score
// drops by one if the character is a space or the blob's noise score is below
// small_limit, and rises by one if the reject map accepts that position.
//
// Returns the accumulated score, clamped so that it is never negative.
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  PBLOB_IT blob_it;
  inT16 score = 0;
  const float small_limit = bln_x_height * fixsp_small_outlines_size;

  if (!fixsp_fp_eval)
    return (eval_word_spacing (word_res_list));

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *word = word_it.data();
    const inT16 num_chars = word->reject_map.length();
    const char *text = word->best_choice->string().string();

    // Decide whether this word is trustworthy enough to be scored.
    bool trusted = word->done || word->tess_accepted;
    if (!trusted) {
      switch (word->best_choice->permuter()) {
        case SYSTEM_DAWG_PERM:
        case FREQ_DAWG_PERM:
        case USER_DAWG_PERM:
          trusted = true;
          break;
        default:
          trusted = safe_dict_word(text) > 0;
          break;
      }
    }
    if (!trusted)
      continue;

    // Walk characters and blobs in step; text_pos tracks the byte offset of
    // character pos within the (possibly multi-byte) string.
    blob_it.set_to_list(word->outword->blob_list());
    inT16 text_pos = 0;
    for (inT16 pos = 0; pos < num_chars; ++pos) {
      if (text[text_pos] == ' ' ||
          blob_noise_score(blob_it.data()) < small_limit) {
        score -= 1;  // penalise possibly erroneous non-space
      } else if (word->reject_map[pos].accepted()) {
        score++;
      }
      text_pos += word->best_choice->lengths()[pos];
      blob_it.forward();
    }
  }

  return score < 0 ? 0 : score;
}
示例#2
0
// Scores the spacing of the words in word_res_list for fixed-pitch text.
//
// Words without a rebuild_word are skipped.  Of the rest, only words that are
// done, tess-accepted, chosen by one of the system/frequency/user dawg
// permuters, or for which safe_dict_word() is positive contribute.  For each
// scored position the score drops by one if the character is a space or the
// blob's noise score is below small_limit, and rises by one if the reject map
// accepts that position.
//
// Returns the accumulated score, clamped so that it is never negative.
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  WERD_RES *word;
  inT16 word_length;
  inT16 score = 0;
  inT16 i;
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.
    word_length = word->reject_map.length();
    if (word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0) {
      TBLOB* blob = word->rebuild_word->blobs;
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      // reject_map[i] is read below, so i must stay within the reject map as
      // well as within the best choice and the blob chain.
      for (i = 0; i < word_length && i < word->best_choice->length() &&
           blob != NULL;
           ++i, blob = blob->next) {
        if (word->best_choice->unichar_id(i) == space ||
            blob_noise_score(blob) < small_limit) {
          score -= 1;  // penalise possibly erroneous non-space
        } else if (word->reject_map[i].accepted()) {
          score++;
        }
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}
示例#3
0
// Finds the noisiest blob in the interior of a word.
//
// Each blob gets a noise score: non_noise_limit if the reject map accepts that
// position, otherwise blob_noise_score().  Candidates are the blobs lying
// strictly between the first fixsp_non_noise_limit non-noise blobs counted
// from the front and the first fixsp_non_noise_limit counted from the back.
//
// word_res           word to inspect; must have at most 512 blobs.
// worst_noise_score  out: written only once the candidate range has been
//                    established; receives the lowest candidate score, or
//                    small_limit if no candidate scores below it.
//
// Returns the index of the candidate with the lowest score below small_limit,
// or -1 if the word has no rebuild_word, has fewer than 5 blobs, lacks enough
// non-noise blobs at either end, or has no candidate below small_limit.
inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
                                  float *worst_noise_score) {
  float noise_score[512];
  int i;
  int min_noise_blob;            // 1st contender
  int max_noise_blob;            // last contender
  int non_noise_count;
  int worst_noise_blob;          // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == NULL)
    return -1;  // Can't handle cube words.

  TBLOB* blob = word_res->rebuild_word->blobs;
  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;                   // too short to split

  /* Get the noise scores for all blobs */

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
  #endif

  for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob);

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  // The blob chain may end before blob_count entries have been scored.  Only
  // noise_score[0..i-1] is initialised, so restrict the scans below to that
  // range instead of reading indeterminate values.
  if (i < blob_count) {
    blob_count = i;
    if (blob_count < 5)
      return -1;                 // too short to split
  }

  /* Now find the worst one which is far enough away from the end of the word */

  // Skip forward past the required number of non-noise blobs; i ends one past
  // the last of them.
  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  min_noise_blob = i;

  // Same from the back; i ends one before the last of them.
  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
       i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  // Only scores strictly below small_limit qualify; ties keep the earliest.
  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}
// Finds the noisiest blob in the interior of a word.
//
// Each blob gets a noise score: non_noise_limit if the reject map accepts that
// position, otherwise blob_noise_score().  Candidates are the blobs lying
// strictly between the first fixsp_non_noise_limit non-noise blobs counted
// from the front and the first fixsp_non_noise_limit counted from the back.
//
// On reaching the final scan, *worst_noise_score is set to the lowest
// candidate score (or small_limit if none is lower).  Returns the index of
// that candidate, or -1 if the word has fewer than 5 blobs, lacks enough
// non-noise blobs at either end, or has no candidate below small_limit.
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  PBLOB_IT blob_it;
  float noise_score[512];
  const float small_limit = bln_x_height * fixsp_small_outlines_size;
  const float non_noise_limit = bln_x_height * 0.8;

  blob_it.set_to_list(word_res->outword->blob_list());
  // normalised
  const inT16 blob_count = blob_it.length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5) {
    return -1;  // too short to split
  }

  // Score every blob, optionally echoing the scores to the debug log.
  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5) {
    tprintf ("FP fixspace Noise metrics for \"%s\": ",
      word_res->best_choice->string ().string ());
  }
  #endif

  for (int idx = 0; idx < blob_count; ++idx) {
    noise_score[idx] = word_res->reject_map[idx].accepted()
                           ? non_noise_limit
                           : blob_noise_score(blob_it.data());
    if (debug_fix_space_level > 5) {
      tprintf ("%1.1f ", noise_score[idx]);
    }
    blob_it.forward();
  }
  if (debug_fix_space_level > 5) {
    tprintf ("\n");
  }

  // Advance from the front past the required number of non-noise blobs;
  // first_candidate ends one past the last of them.
  int first_candidate = 0;
  int solid_seen = 0;
  while (first_candidate < blob_count && solid_seen < fixsp_non_noise_limit) {
    if (noise_score[first_candidate] >= non_noise_limit) {
      ++solid_seen;
    }
    ++first_candidate;
  }
  if (solid_seen < fixsp_non_noise_limit) {
    return -1;
  }

  // Same from the back; last_candidate ends one before the last of them.
  int last_candidate = blob_count - 1;
  solid_seen = 0;
  while (last_candidate >= 0 && solid_seen < fixsp_non_noise_limit) {
    if (noise_score[last_candidate] >= non_noise_limit) {
      ++solid_seen;
    }
    --last_candidate;
  }
  if (solid_seen < fixsp_non_noise_limit) {
    return -1;
  }

  if (first_candidate > last_candidate) {
    return -1;
  }

  // Pick the lowest score strictly below small_limit; ties keep the earliest.
  int worst_index = -1;
  *worst_noise_score = small_limit;
  for (int idx = first_candidate; idx <= last_candidate; ++idx) {
    if (noise_score[idx] < *worst_noise_score) {
      *worst_noise_score = noise_score[idx];
      worst_index = idx;
    }
  }
  return worst_index;
}