コード例 #1
0
ファイル: bestfirst.cpp プロジェクト: ArunPandiyan/textfairy
// Synthesizes one BLOB_CHOICE for the character `unichar` from the classifier
// results of the fragments it was split into.
//   unichar: the class the combined fragments stand for.
//   expanded_fragment_lengths: indexed by choice index; the entry at
//     choice_index is the number of fragments that make up the character.
//   choice_index: on entry the index of the LAST fragment; it is stepped
//     backwards once per fragment to reach the earlier ones.
//   old_choices: classifier output list for each fragment, by choice index.
// The result carries the sum of the matched fragments' ratings, the lowest of
// their certainties (never above 0), and the intersection of their xheight
// ranges.
// Never returns null: if a fragment cannot be found in its choice list the
// failure is printed and ASSERT_HOST fires.
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  float total_rating = 0.0f;
  float worst_certainty = 0.0f;
  // Start from the widest possible range; each fragment narrows it.
  inT16 xheight_lo = -MAX_INT16;
  inT16 xheight_hi = MAX_INT16;

  int piece = expanded_fragment_lengths[choice_index] - 1;
  while (piece >= 0) {
    // Classifier results recorded for the blob at this index.
    BLOB_CHOICE_LIST *candidates = old_choices->get(choice_index);

    // Describe the fragment we expect here (piece `piece` of the total),
    // then scan the candidates for the first entry that matches it.
    CHAR_FRAGMENT wanted;
    wanted.set_all(unichar, piece,
                   expanded_fragment_lengths[choice_index], false);

    BLOB_CHOICE_IT choice_it(candidates);
    choice_it.mark_cycle_pt();
    while (!choice_it.cycled_list()) {
      BLOB_CHOICE* candidate = choice_it.data();
      const CHAR_FRAGMENT *candidate_fragment =
          getDict().getUnicharset().get_fragment(candidate->unichar_id());
      if (candidate_fragment != NULL && wanted.equals(candidate_fragment)) {
        // Fold this fragment into the character being composed.
        total_rating += candidate->rating();
        if (candidate->certainty() < worst_certainty) {
          worst_certainty = candidate->certainty();
        }
        IntersectRange(candidate->min_xheight(), candidate->max_xheight(),
                       &xheight_lo, &xheight_hi);
        break;
      }
      choice_it.forward();
    }

    // Having cycled through the whole list means no candidate matched.
    if (choice_it.cycled_list()) {
      print_ratings_list("Failure", candidates, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              wanted.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.

    --piece;
    --choice_index;
  }

  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         total_rating, worst_certainty, -1, -1, 0,
                         xheight_lo, xheight_hi, false);
}
コード例 #2
0
void LMConsistencyInfo::ComputeXheightConsistency(
    const BLOB_CHOICE *b, bool is_punc) {
    if (xht_decision == XH_INCONSISTENT)
        return;  // It isn't going to get any better.

    // Compute xheight consistency.
    bool parent_null = xht_sp < 0;
    int parent_sp = xht_sp;
    // Debug strings.
    if (b->yshift() > LMConsistencyInfo::kShiftThresh) {
        xht_sp = LMConsistencyInfo::kSUP;
    } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) {
        xht_sp = LMConsistencyInfo::kSUB;
    } else {
        xht_sp = LMConsistencyInfo::kNORM;
    }
    xht_count[xht_sp]++;
    if (is_punc) xht_count_punc[xht_sp]++;
    if (!parent_null) {
        xpos_entropy += abs(parent_sp - xht_sp);
    }
    // TODO(eger): Figure out a better way to account for small caps.
    // For the first character not y-shifted, we only care if it is too small.
    // Too large is common in drop caps and small caps.
    // inT16 small_xht = b->min_xheight();
    //  if (parent_vse == NULL && sp == LanguageModelConsistencyInfo::kNORM) {
    //  small_xht = 0;
    // }
    IntersectRange(b->min_xheight(), b->max_xheight(),
                   &(xht_lo[xht_sp]), &(xht_hi[xht_sp]));


    // Compute xheight inconsistency kinds.
    if (parent_null) {
        if (xht_count[kNORM] == 1) {
            xht_decision = XH_GOOD;
        } else {
            xht_decision = XH_SUBNORMAL;
        }
        return;
    }

    // When we intersect the ranges of xheights in pixels for all characters in
    // each position (subscript, normal, superscript),
    // How much range must be left?  0? [exactly one pixel height for xheight] 1?
    // TODO(eger): Extend this code to take a prior for the rest of the line.
    const int kMinIntersectedXHeightRange = 0;
    for (int i = 0; i < kNumPos; i++) {
        if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) {
            xht_decision = XH_INCONSISTENT;
            return;
        }
    }

    // Reject as improbable anything where there's much punctuation in subscript
    // or superscript regions.
    if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 ||
            xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) {
        xht_decision = XH_INCONSISTENT;
        return;
    }

    // Now check that the subscript and superscript aren't too small relative to
    // the mainline.
    double mainline_xht = static_cast<double>(xht_lo[kNORM]);
    double kMinSizeRatio = 0.4;
    if (mainline_xht > 0.0 &&
            (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio ||
             static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) {
        xht_decision = XH_INCONSISTENT;
        return;
    }
    // TODO(eger): Check into inconsistency of super/subscript y offsets.
    if (xpos_entropy > kMaxEntropy) {
        xht_decision = XH_INCONSISTENT;
        return;
    }
    if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) {
        xht_decision = XH_GOOD;
        return;
    }
    xht_decision = XH_SUBNORMAL;
}
コード例 #3
0
ファイル: pieces.cpp プロジェクト: 11110101/tess-two
/**********************************************************************
 * merge_and_put_fragment_lists
 *
 * Merge the fragment lists in choice_lists and append the result to the
 * ratings matrix.
 *
 * row, column     Cell of the ratings matrix that receives the merged
 *                 choices. Choices are appended to the list already stored
 *                 there, or to a new list if the cell is NULL.
 * num_frag_parts  Number of lists in choice_lists.
 * choice_lists    Array of num_frag_parts choice lists, one per fragment.
 *                 The lock-step walk below only works if every list is
 *                 ordered by ascending unichar_id (assumed; confirm
 *                 against the caller).
 * ratings         Matrix to update.
 *
 * Whenever all lists hold an entry with the same unichar_id, one merged
 * BLOB_CHOICE is produced: ratings are summed, the lowest certainty is
 * kept, x-height ranges are intersected, and the yshift is the largest
 * positive or most negative shift seen (0 if both signs occur). The font,
 * script and classifier fields are copied from the first list's entry.
 *
 * If num_frag_parts is not positive or any list is empty there is nothing
 * to merge and the matrix is left untouched.
 **********************************************************************/
void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
                                           inT16 num_frag_parts,
                                           BLOB_CHOICE_LIST *choice_lists,
                                           MATRIX *ratings) {
  // The merge loop dereferences every iterator's data() before it can notice
  // the end of a list, so a missing or empty list must be rejected up front.
  if (num_frag_parts <= 0)
    return;
  for (int i = 0; i < num_frag_parts; i++) {
    if (choice_lists[i].empty())
      return;
  }

  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];

  for (int i = 0; i < num_frag_parts; i++) {
    choice_lists_it[i].set_to_list(&choice_lists[i]);
    choice_lists_it[i].mark_cycle_pt();
  }

  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
  // Only a list allocated here may be deleted here. A list fetched from the
  // matrix is presumably still referenced by it (put() is never called to
  // clear the cell), so deleting it would leave a dangling pointer behind.
  const bool created_here = (merged_choice == NULL);
  if (created_here)
    merged_choice = new BLOB_CHOICE_LIST;

  bool end_of_list = false;
  BLOB_CHOICE_IT merged_choice_it(merged_choice);
  while (!end_of_list) {
    // Find the maximum unichar_id of the current entry the iterators
    // are pointing at
    UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (max_unichar_id < unichar_id) {
        max_unichar_id = unichar_id;
      }
    }

    // Move each iterator until it gets to an entry that has a
    // value greater than or equal to max_unichar_id
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      while (!choice_lists_it[i].cycled_list() &&
             unichar_id < max_unichar_id) {
        choice_lists_it[i].forward();
        unichar_id = choice_lists_it[i].data()->unichar_id();
      }
      // Once any list is exhausted no further common entry can exist.
      if (choice_lists_it[i].cycled_list()) {
        end_of_list = true;
        break;
      }
    }

    if (end_of_list)
      break;

    // Checks if the fragments are parts of the same character
    UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
    bool same_unichar = true;
    for (int i = 1; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (unichar_id != first_unichar_id) {
        same_unichar = false;
        break;
      }
    }

    if (same_unichar) {
      // Add the merged character to the result
      UNICHAR_ID merged_unichar_id = first_unichar_id;
      inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
      inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
      float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
      float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
      float positive_yshift = 0, negative_yshift = 0;
      int merged_script_id = choice_lists_it[0].data()->script_id();
      BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();

      float merged_rating = 0, merged_certainty = 0;
      for (int i = 0; i < num_frag_parts; i++) {
        // Read every field of the matched entry BEFORE advancing. Reading
        // after forward() would pick up the x-height range and yshift of
        // the following (or wrapped-around first) entry instead.
        BLOB_CHOICE *part = choice_lists_it[i].data();
        float rating = part->rating();
        float certainty = part->certainty();

        if (i == 0 || certainty < merged_certainty)
          merged_certainty = certainty;
        merged_rating += rating;

        IntersectRange(part->min_xheight(), part->max_xheight(),
                       &merged_min_xheight, &merged_max_xheight);
        float yshift = part->yshift();
        if (yshift > positive_yshift) positive_yshift = yshift;
        if (yshift < negative_yshift) negative_yshift = yshift;

        // Step past the consumed entry. The remaining iterators are still
        // advanced even if this one has just wrapped around.
        choice_lists_it[i].forward();
        if (choice_lists_it[i].cycled_list())
          end_of_list = true;
      }

      // Fragments shifted in opposite directions cancel to no shift at all.
      float merged_yshift = positive_yshift != 0
          ? (negative_yshift != 0 ? 0 : positive_yshift)
          : negative_yshift;
      merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
                                                  merged_rating,
                                                  merged_certainty,
                                                  merged_fontinfo_id,
                                                  merged_fontinfo_id2,
                                                  merged_script_id,
                                                  merged_min_xheight,
                                                  merged_max_xheight,
                                                  merged_yshift,
                                                  classifier));
    }
  }

  if (classify_debug_level)
    print_ratings_list("Merged Fragments", merged_choice,
                       unicharset);

  if (!merged_choice->empty()) {
    ratings->put(row, column, merged_choice);
  } else if (created_here) {
    // Nothing was merged and the matrix never saw this list: discard it.
    delete merged_choice;
  }

  delete [] choice_lists_it;
}