Ejemplo n.º 1
0
/**********************************************************************
 * fill_filtered_fragment_list
 *
 * Scans choices for entries whose unichar is a character fragment at
 * position fragment_pos out of exactly num_frag_parts pieces. For each
 * such entry a copy is made, the copy's unichar id is replaced by the id
 * of the whole character the fragment belongs to, and the copy is
 * appended to filtered_choices. Finally filtered_choices is sorted with
 * SortByUnicharID. The entries of choices themselves are not modified.
 **********************************************************************/
void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
        int fragment_pos,
        int num_frag_parts,
        BLOB_CHOICE_LIST *filtered_choices) {
    BLOB_CHOICE_IT dest_it(filtered_choices);
    BLOB_CHOICE_IT src_it(choices);

    for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
        BLOB_CHOICE *candidate = src_it.data();
        const CHAR_FRAGMENT *frag =
            unicharset.get_fragment(candidate->unichar_id());

        // Skip anything that is not the requested piece of a fragment set.
        if (frag == NULL) continue;
        if (frag->get_pos() != fragment_pos) continue;
        if (frag->get_total() != num_frag_parts) continue;

        // Copy the choice and relabel it with the id of the full character
        // that this fragment is a part of.
        BLOB_CHOICE *relabelled = new BLOB_CHOICE(*candidate);
        relabelled->set_unichar_id(
            unicharset.unichar_to_id(frag->get_unichar()));
        dest_it.add_to_end(relabelled);
    }

    filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
}
Ejemplo n.º 2
0
// Compares the vertical placement and x-height range of *this with other.
// Fails immediately if the two yshift values differ by more than
// kMaxBaselineDrift * x_height. Otherwise the overlap of the two
// [min_xheight, max_xheight] ranges is divided by the smaller of the two
// range widths (clipped to [1.0, kMaxOverlapDenominator * x_height]) and the
// result is true iff that ratio is at least kMinXHeightMatch.
// When debug is true the intermediate values are reported via tprintf.
bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
                                  bool debug) const {
  const double yshift_gap = fabs(yshift() - other.yshift());
  if (yshift_gap > kMaxBaselineDrift * x_height) {
    if (debug) {
      tprintf("Baseline diff %g for %d v %d\n",
              yshift_gap, unichar_id_, other.unichar_id_);
    }
    return false;
  }

  // Widths of the two acceptable x-height ranges.
  const double my_width = max_xheight() - min_xheight();
  const double their_width = other.max_xheight() - other.min_xheight();
  const double norm = ClipToRange(MIN(my_width, their_width),
                                  1.0, kMaxOverlapDenominator * x_height);

  // Length of the intersection of the two ranges (negative if disjoint),
  // expressed as a fraction of norm.
  double shared = MIN(max_xheight(), other.max_xheight()) -
                  MAX(min_xheight(), other.min_xheight());
  shared /= norm;

  if (debug) {
    tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
            unichar_id_, other.unichar_id_, yshift_gap,
            my_width, their_width, norm, shared);
  }

  return shared >= kMinXHeightMatch;
}
Ejemplo n.º 3
0
/**********************************************************************
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: takes the unichar id, rating, certainty, config and
 * script id from other, each read through the corresponding accessor.
 **********************************************************************/
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other)
    : unichar_id_(other.unichar_id()),
      rating_(other.rating()),
      certainty_(other.certainty()),
      config_(other.config()),
      script_id_(other.script_id()) {
}
Ejemplo n.º 4
0
// Builds and returns a new MATRIX whose dimension is one larger than that of
// *this, making room for a blob split at the (ind,ind) diagonal location.
// The result's bandwidth is one more than the current bandwidth if any
// column from ind down to ind - bandwidth() + 1 (stopping at 0) has a
// non-empty entry in its last band slot; otherwise it is unchanged.
// Every non-NULL BLOB_CHOICE_LIST is moved to the coordinate produced by
// MATRIX_COORD::MapForSplit(ind), and each BLOB_CHOICE in it gets its matrix
// cell updated to that coordinate.
// The list pointers are handed to the new MATRIX and *this is deleted, so
// the old MATRIX must not be used after this call.
MATRIX* MATRIX::ConsumeAndMakeBigger(int ind) {
  const int old_dim = dimension();
  const int old_band = bandwidth();

  // Decide whether the band has to grow by one to hold the split.
  int new_band = old_band;
  for (int c = ind; c >= 0 && c > ind - old_band; --c) {
    if (array_[c * old_band + old_band - 1] != empty_) {
      new_band = old_band + 1;
      break;
    }
  }
  MATRIX* bigger = new MATRIX(old_dim + 1, new_band);

  // Walk the populated band of the old matrix and relocate each cell.
  for (int c = 0; c < old_dim; ++c) {
    for (int r = c; r < old_dim && r < c + old_band; ++r) {
      BLOB_CHOICE_LIST* cell_choices = get(c, r);
      if (cell_choices == NULL) continue;

      MATRIX_COORD target(c, r);
      target.MapForSplit(ind);
      // Keep the location stored in each choice in sync with its new cell.
      BLOB_CHOICE_IT it(cell_choices);
      for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
        it.data()->set_matrix_cell(target.col, target.row);
      }
      ASSERT_HOST(target.Valid(*bigger));
      bigger->put(target.col, target.row, cell_choices);
    }
  }
  delete this;
  return bigger;
}
Ejemplo n.º 5
0
// Returns true if there are any real classification results.
bool MATRIX::Classified(int col, int row, int wildcard_id) const {
  if (get(col, row) == NOT_CLASSIFIED) return false;
  BLOB_CHOICE_IT b_it(get(col, row));
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOB_CHOICE* choice = b_it.data();
    if (choice->IsClassified())
      return true;
  }
  return false;
}
Ejemplo n.º 6
0
// Scans bc_list from its start and returns the first BLOB_CHOICE whose
// unichar_id() equals char_id. Returns NULL if no entry matches.
BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
                                BLOB_CHOICE_LIST* bc_list) {
  BLOB_CHOICE_IT it(bc_list);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOB_CHOICE* candidate = it.data();
    if (candidate->unichar_id() != char_id) continue;
    return candidate;
  }
  return NULL;
}
Ejemplo n.º 7
0
/**
 * append_choices
 *
 * Checks to see whether or not the next choice is worth appending to
 * the word being generated. If so then keeps going deeper into the word.
 *
 * This function assumes that Dict::go_deeper_fxn_ is set.
 */
void Dict::append_choices(
    const char *debug,
    const BLOB_CHOICE_LIST_VECTOR &char_choices,
    const BLOB_CHOICE &blob_choice,
    int char_choice_index,
    const CHAR_FRAGMENT_INFO *prev_char_frag_info,
    WERD_CHOICE *word,
    float certainties[],
    float *limit,
    WERD_CHOICE *best_choice,
    int *attempts_left,
    void *more_args) {
  int word_ending =
    (char_choice_index == char_choices.length() - 1) ? true : false;

  // Deal with fragments.
  CHAR_FRAGMENT_INFO char_frag_info;
  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
                           blob_choice.certainty(), prev_char_frag_info, debug,
                           word_ending, &char_frag_info)) {
    return;  // blob_choice must be an invalid fragment
  }
  // Search the next letter if this character is a fragment.
  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
    permute_choices(debug, char_choices, char_choice_index + 1,
                    &char_frag_info, word, certainties, limit,
                    best_choice, attempts_left, more_args);
    return;
  }

  // Add the next unichar.
  float old_rating = word->rating();
  float old_certainty = word->certainty();
  uint8_t old_permuter = word->permuter();
  certainties[word->length()] = char_frag_info.certainty;
  word->append_unichar_id_space_allocated(
      char_frag_info.unichar_id, char_frag_info.num_fragments,
      char_frag_info.rating, char_frag_info.certainty);

  // Explore the next unichar.
  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
                          &char_frag_info, word_ending, word, certainties,
                          limit, best_choice, attempts_left, more_args);

  // Remove the unichar we added to explore other choices in it's place.
  word->remove_last_unichar_id();
  word->set_rating(old_rating);
  word->set_certainty(old_certainty);
  word->set_permuter(old_permuter);
}
Ejemplo n.º 8
0
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: takes the unichar id, rating, certainty, both configs
 * and the script id from other through its accessors. The language model
 * state is NOT copied; the new object starts with it set to NULL.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other)
    : unichar_id_(other.unichar_id()),
      rating_(other.rating()),
      certainty_(other.certainty()),
      config_(other.config()),
      config2_(other.config2()),
      script_id_(other.script_id()),
      language_model_state_(NULL) {
}
Ejemplo n.º 9
0
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: duplicates the classification result (unichar id,
 * rating, certainty, script id), both font ids, the matrix cell, the
 * x-height range, the yshift and the classifier tag of other.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Values obtained through other's accessors.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  rating_ = other.rating();
  certainty_ = other.certainty();
  yshift_ = other.yshift();
  fontinfo_id_ = other.fontinfo_id();
  fontinfo_id2_ = other.fontinfo_id2();

  // Values copied straight from other's members.
  classifier_ = other.classifier_;
  matrix_cell_ = other.matrix_cell_;
  min_xheight_ = other.min_xheight_;
  max_xheight_ = other.max_xheight_;
}
Ejemplo n.º 10
0
/**
 * BLOB_CHOICE::BLOB_CHOICE
 *
 * Copy constructor: duplicates the classification result (unichar id,
 * rating, certainty, script id), both font ids, the x-height range and the
 * adapted flag of other. The language model state is NOT copied; the new
 * object starts with it set to NULL.
 */
BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
  // Values obtained through other's accessors.
  unichar_id_ = other.unichar_id();
  script_id_ = other.script_id();
  rating_ = other.rating();
  certainty_ = other.certainty();
  fontinfo_id_ = other.fontinfo_id();
  fontinfo_id2_ = other.fontinfo_id2();

  // Values copied straight from other's members.
  adapted_ = other.adapted_;
  min_xheight_ = other.min_xheight_;
  max_xheight_ = other.max_xheight_;

  // Search state belongs to the original only.
  language_model_state_ = NULL;
}
Ejemplo n.º 11
0
// Synthesizes a single BLOB_CHOICE for the character `unichar` out of the
// classifier results of its individual fragments.
// choice_index initially refers to the LAST fragment;
// expanded_fragment_lengths[choice_index] gives the number of fragments, and
// both the fragment position and choice_index are decremented together to
// walk back to the first fragment. old_choices->get(choice_index) supplies
// the classifier output for each fragment.
// For every fragment the matching entry is looked up in its list: ratings
// are summed, the certainty is the lowest one seen (never above 0, the
// starting value), and the x-height range is the intersection of all parts.
// If any fragment cannot be found, the list is printed and ASSERT_HOST
// fires, so a normal return always yields a non-null result.
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  float total_rating = 0.0f;
  float lowest_certainty = 0.0f;
  inT16 xheight_lo = -MAX_INT16;
  inT16 xheight_hi = MAX_INT16;

  int piece = expanded_fragment_lengths[choice_index] - 1;
  while (piece >= 0) {
    // Classifier results for the blob that should contain this piece.
    BLOB_CHOICE_LIST *piece_choices = old_choices->get(choice_index);

    // Describe the fragment we expect to find at this position.
    CHAR_FRAGMENT wanted;
    wanted.set_all(unichar, piece,
                   expanded_fragment_lengths[choice_index], false);

    // Search the list; on success accumulate and stop at the match so that
    // cycled_list() stays false.
    BLOB_CHOICE_IT it(piece_choices);
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      BLOB_CHOICE* candidate = it.data();
      const CHAR_FRAGMENT *candidate_frag =
          getDict().getUnicharset().get_fragment(candidate->unichar_id());
      if (!candidate_frag || !wanted.equals(candidate_frag)) continue;

      total_rating += candidate->rating();
      if (candidate->certainty() < lowest_certainty) {
        lowest_certainty = candidate->certainty();
      }
      IntersectRange(candidate->min_xheight(), candidate->max_xheight(),
                     &xheight_lo, &xheight_hi);
      break;
    }

    // A completed cycle means the fragment was not in the list.
    if (it.cycled_list()) {
      print_ratings_list("Failure", piece_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              wanted.to_string().string(), choice_index);
    }
    ASSERT_HOST(!it.cycled_list());  // Be sure we found the fragment.

    --piece;
    --choice_index;
  }

  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         total_rating, lowest_certainty, -1, -1, 0,
                         xheight_lo, xheight_hi, false);
}
Ejemplo n.º 12
0
// Runs the matcher pointed to by tess_matcher on tessblob, with ptblob and
// ntblob (either may be NULL) as its neighbours, and returns the results as
// a LIST built with append_choice. The two trailing parameters are unused.
// If tessblob cannot be converted by make_ed_blob, a single deliberately
// bad choice for unichar id 1 is returned instead.
LIST call_matcher(                  //call a matcher
                  TBLOB *ptblob,    //previous
                  TBLOB *tessblob,  //blob to match
                  TBLOB *ntblob,    //next
                  void *,           //unused parameter
                  TEXTROW *         //always null anyway
                 ) {
  // Only the first entry is ever filled in; it holds the byte length of the
  // unichar string handed to append_choice.
  char choice_lengths[2] = {0, 0};

  PBLOB *blob = make_ed_blob(tessblob);
  if (blob == NULL) {
    // Since it is actually possible to get a NULL blob here, due to invalid
    // segmentations, fake a really bad classification.
    choice_lengths[0] = strlen(unicharset.id_to_unichar(1));
    return append_choice(NULL, unicharset.id_to_unichar(1), choice_lengths,
                         static_cast<float>(MAX_NUM_INT_FEATURES),
                         static_cast<float>(kReallyBadCertainty), 0);
  }

  // Convert the neighbours, when present.
  PBLOB *pblob = (ptblob == NULL) ? NULL : make_ed_blob(ptblob);
  PBLOB *nblob = (ntblob == NULL) ? NULL : make_ed_blob(ntblob);

  BLOB_CHOICE_LIST ratings;      //matcher result
  (*tess_matcher)(pblob, blob, nblob, tess_word, tess_denorm, ratings);

  // The converted blobs are no longer needed. Deleting a null pointer is a
  // no-op, so the neighbours need no explicit check.
  delete blob;
  delete pblob;
  delete nblob;

  // Translate every matcher result into an entry of the output list.
  LIST result = NULL;
  BLOB_CHOICE_IT it;
  it.set_to_list(&ratings);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    BLOB_CHOICE *choice = it.data();
    choice_lengths[0] = strlen(choice->unichar());
    result = append_choice(result, choice->unichar(),
                           choice_lengths, choice->rating(),
                           choice->certainty(), choice->config());
  }
  return result;
}
Ejemplo n.º 13
0
/**********************************************************************
 * merge_and_put_fragment_lists
 *
 * Merges the num_frag_parts fragment lists in choice_lists (one list per
 * fragment position) into whole-character choices and stores the result
 * in the ratings matrix at (row, column).
 *
 * The lists are walked in step: every iterator is advanced until its
 * unichar_id is at least the largest id currently pointed at. This
 * assumes each list is ordered by ascending unichar_id. Whenever all
 * iterators point at the same unichar_id, a merged BLOB_CHOICE is
 * appended whose
 *   - rating is the sum of the part ratings,
 *   - certainty is the lowest part certainty,
 *   - x-height range is the intersection of the part ranges,
 *   - yshift is the largest positive or most negative part yshift, or 0
 *     if the parts disagree in sign,
 *   - fonts are the union of the part fonts, keeping the lowest score
 *     for a font that occurs in several parts,
 *   - script id and classifier are taken from the first part.
 * The walk stops as soon as any list has been fully cycled.
 *
 * If the cell already holds a list, merged choices are appended to it;
 * otherwise a new list is created. An empty result list is deleted
 * rather than stored.
 *
 * NOTE(review): data() is called on every list before cycled_list() is
 * checked, so callers appear to be expected to pass only non-empty
 * lists and num_frag_parts >= 1 - confirm against callers.
 **********************************************************************/
void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
                                           inT16 num_frag_parts,
                                           BLOB_CHOICE_LIST *choice_lists,
                                           MATRIX *ratings) {
  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];

  for (int i = 0; i < num_frag_parts; i++) {
    choice_lists_it[i].set_to_list(&choice_lists[i]);
    choice_lists_it[i].mark_cycle_pt();
  }

  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
  if (merged_choice == NULL)
    merged_choice = new BLOB_CHOICE_LIST;

  bool end_of_list = false;
  BLOB_CHOICE_IT merged_choice_it(merged_choice);
  while (!end_of_list) {
    // Find the maximum unichar_id of the current entry the iterators
    // are pointing at
    UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (max_unichar_id < unichar_id) {
        max_unichar_id = unichar_id;
      }
    }

    // Move each iterator until it gets to an entry that has a
    // value greater than or equal to max_unichar_id
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      while (!choice_lists_it[i].cycled_list() &&
             unichar_id < max_unichar_id) {
        choice_lists_it[i].forward();
        unichar_id = choice_lists_it[i].data()->unichar_id();
      }
      if (choice_lists_it[i].cycled_list()) {
        end_of_list = true;
        break;
      }
    }

    if (end_of_list)
      break;

    // Checks if the fragments are parts of the same character
    UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
    bool same_unichar = true;
    for (int i = 1; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (unichar_id != first_unichar_id) {
        same_unichar = false;
        break;
      }
    }

    if (same_unichar) {
      // Add the merged character to the result
      UNICHAR_ID merged_unichar_id = first_unichar_id;
      GenericVector<ScoredFont> merged_fonts =
          choice_lists_it[0].data()->fonts();
      float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
      float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
      float positive_yshift = 0, negative_yshift = 0;
      int merged_script_id = choice_lists_it[0].data()->script_id();
      BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();

      float merged_rating = 0, merged_certainty = 0;
      for (int i = 0; i < num_frag_parts; i++) {
        // Read every property from the part that is actually being merged
        // BEFORE stepping the iterator. Advancing first would take the
        // x-height range, yshift and fonts from the following entry (or
        // from the first entry again after wrap-around), i.e. from a
        // different character than the rating and certainty.
        BLOB_CHOICE *part = choice_lists_it[i].data();
        float rating = part->rating();
        float certainty = part->certainty();

        if (i == 0 || certainty < merged_certainty)
          merged_certainty = certainty;
        merged_rating += rating;

        IntersectRange(part->min_xheight(), part->max_xheight(),
                       &merged_min_xheight, &merged_max_xheight);
        float yshift = part->yshift();
        if (yshift > positive_yshift) positive_yshift = yshift;
        if (yshift < negative_yshift) negative_yshift = yshift;
        // Use the min font rating over the parts.
        // TODO(rays) font lists are unsorted. Need to be faster?
        const GenericVector<ScoredFont>& frag_fonts = part->fonts();
        for (int f = 0; f < frag_fonts.size(); ++f) {
          int merged_f = 0;
          for (merged_f = 0; merged_f < merged_fonts.size() &&
               merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
               ++merged_f) {}
          if (merged_f == merged_fonts.size()) {
            merged_fonts.push_back(frag_fonts[f]);
          } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
            merged_fonts[merged_f].score = frag_fonts[f].score;
          }
        }

        // Only now move on to the next entry of this part's list.
        choice_lists_it[i].forward();
        if (choice_lists_it[i].cycled_list())
          end_of_list = true;
      }

      // Parts shifted in opposite directions cancel to no shift at all.
      float merged_yshift = positive_yshift != 0
          ? (negative_yshift != 0 ? 0 : positive_yshift)
          : negative_yshift;
      BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
                                            merged_rating,
                                            merged_certainty,
                                            merged_script_id,
                                            merged_min_xheight,
                                            merged_max_xheight,
                                            merged_yshift,
                                            classifier);
      choice->set_fonts(merged_fonts);
      merged_choice_it.add_to_end(choice);
    }
  }

  if (classify_debug_level)
    print_ratings_list("Merged Fragments", merged_choice,
                       unicharset);

  if (merged_choice->empty())
    delete merged_choice;
  else
    ratings->put(row, column, merged_choice);

  delete [] choice_lists_it;
}
Ejemplo n.º 14
0
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = nullptr;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it (new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
      werd_choice->print("ReplaceAmbig() ");
      tprintf("Modified blob_choices: ");
      print_ratings_list("\n", new_choices, getUnicharset());
  }
}
Ejemplo n.º 15
0
void LMPainPoints::GenerateFromPath(float rating_cert_scale,
                                    ViterbiStateEntry *vse,
                                    WERD_RES *word_res) {
  ViterbiStateEntry *curr_vse = vse;
  BLOB_CHOICE *curr_b = vse->curr_b;
  // The following pain point generation and priority calculation approaches
  // prioritize exploring paths with low average rating of the known part of
  // the path, while not relying on the ratings of the pieces to be combined.
  //
  // A pain point to combine the neighbors is generated for each pair of
  // neighboring blobs on the path (the path is represented by vse argument
  // given to GenerateFromPath()). The priority of each pain point is set to
  // the average rating (per outline length) of the path, not including the
  // ratings of the blobs to be combined.
  // The ratings of the blobs to be combined are not used to calculate the
  // priority, since it is not possible to determine from their magnitude
  // whether it will be beneficial to combine the blobs. The reason is that
  // chopped junk blobs (/ | - ') can have very good (low) ratings, however
  // combining them will be beneficial. Blobs with high ratings might be
  // over-joined pieces of characters, but also could be blobs from an unseen
  // font or chopped pieces of complex characters.
  while (curr_vse->parent_vse != NULL) {
    ViterbiStateEntry* parent_vse = curr_vse->parent_vse;
    const MATRIX_COORD& curr_cell = curr_b->matrix_cell();
    const MATRIX_COORD& parent_cell = parent_vse->curr_b->matrix_cell();
    MATRIX_COORD pain_coord(parent_cell.col, curr_cell.row);
    if (!pain_coord.Valid(*word_res->ratings) ||
        !word_res->ratings->Classified(parent_cell.col, curr_cell.row,
                                       dict_->WildcardID())) {
      // rat_subtr contains ratings sum of the two adjacent blobs to be merged.
      // rat_subtr will be subtracted from the ratings sum of the path, since
      // the blobs will be joined into a new blob, whose rating is yet unknown.
      float rat_subtr = curr_b->rating() + parent_vse->curr_b->rating();
      // ol_subtr contains the outline length of the blobs that will be joined.
      float ol_subtr =
          AssociateUtils::ComputeOutlineLength(rating_cert_scale, *curr_b) +
          AssociateUtils::ComputeOutlineLength(rating_cert_scale,
                                               *(parent_vse->curr_b));
      // ol_dif is the outline of the path without the two blobs to be joined.
      float ol_dif = vse->outline_length - ol_subtr;
      // priority is set to the average rating of the path per unit of outline,
      // not counting the ratings of the pieces to be joined.
      float priority = ol_dif > 0 ? (vse->ratings_sum-rat_subtr)/ol_dif : 0.0;
      GeneratePainPoint(pain_coord.col, pain_coord.row, LM_PPTYPE_PATH,
                        priority, true, max_char_wh_ratio_, word_res);
    } else if (debug_level_ > 3) {
      tprintf("NO pain point (Classified) for col=%d row=%d type=%s\n",
              pain_coord.col, pain_coord.row,
              LMPainPointsTypeName[LM_PPTYPE_PATH]);
      BLOB_CHOICE_IT b_it(word_res->ratings->get(pain_coord.col,
                                                 pain_coord.row));
      for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
        BLOB_CHOICE* choice = b_it.data();
        choice->print_full();
      }
    }

    curr_vse = parent_vse;
    curr_b = curr_vse->curr_b;
  }
}
Ejemplo n.º 16
0
/**
 * Return whether this is believable superscript or subscript text.
 *
 * We insist that:
 *   + there are no punctuation marks.
 *   + there are no italics.
 *   + no normal-sized character is smaller than superscript_scaledown_ratio
 *     of what it ought to be, and
 *   + each character is at least as certain as certainty_threshold.
 *
 *  @param[in]  debug  If true, spew debug output
 *  @param[in]  word   The word whose best_choice we're evaluating
 *  @param[in]  certainty_threshold   If any of the characters have less
 *                    certainty than this, reject.
 *  @param[out]  left_ok  How many left-side characters were ok?
 *  @param[out]  right_ok  How many right-side characters were ok?
 *  @return  Whether the complete best choice is believable as a superscript.
 */
bool Tesseract::BelievableSuperscript(bool debug,
                                      const WERD_RES &word,
                                      float certainty_threshold,
                                      int *left_ok,
                                      int *right_ok) const {
  int initial_ok_run_count = 0;
  int ok_run_count = 0;
  float worst_certainty = 0.0f;
  const WERD_CHOICE &wc = *word.best_choice;

  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
  for (int i = 0; i < wc.length(); i++) {
    TBLOB *blob = word.rebuild_word->blobs[i];
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    float char_certainty = wc.certainty(i);
    bool bad_certainty = char_certainty < certainty_threshold;
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fontinfo_table.size() > 0) {
      // Get better information from the specific choice, if available.
      int font_id1 = choice->fontinfo_id();
      bool font1_is_italic = font_id1 >= 0
          ? fontinfo_table.get(font_id1).is_italic() : false;
      int font_id2 = choice->fontinfo_id2();
      is_italic = font1_is_italic &&
          (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
    }

    float height_fraction = 1.0f;
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    if (wc.unicharset()->top_bottom_useful()) {
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float hi_height = max_top - max_bot;
      float lo_height = min_top - min_bot;
      normal_height = (hi_height + lo_height) / 2;
      if (normal_height >= kBlnXHeight) {
        // Only ding characters that we have decent information for because
        // they're supposed to be normal sized, not tiny specks or dashes.
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n", char_str,
                char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n", char_str, char_height, normal_height);
      }
    }
    if (bad_certainty || bad_height || is_punc || is_italic) {
      if (ok_run_count == i) {
        initial_ok_run_count = ok_run_count;
      }
      ok_run_count = 0;
    } else {
      ok_run_count++;
    }
    if (char_certainty < worst_certainty) {
      worst_certainty = char_certainty;
    }
  }
  bool all_ok = ok_run_count == wc.length();
  if (all_ok && debug) {
    tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
  }
  if (!all_ok) {
    if (left_ok) *left_ok = initial_ok_run_count;
    if (right_ok) *right_ok = ok_run_count;
  }
  return all_ok;
}
Ejemplo n.º 17
0
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos
/// @param choices_length
/// @param target_text
/// @param text_index
/// @param rating
/// @param segmentation
/// @param best_rating
/// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                              int choices_pos, int choices_length,
                              const GenericVector<UNICHAR_ID>& target_text,
                              int text_index,
                              float rating, GenericVector<int>* segmentation,
                              float* best_rating,
                              GenericVector<int>* best_segmentation) {
  const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      choice_rating = choice->rating();
      UNICHAR_ID class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table.
      if (class_id < table.size() && table[class_id] != NULL) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
             spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index])
            break;
        }
        if (!spec_it.cycled_list())
          break;  // Found an ambig.
      }
    }
    if (choice_it.cycled_list())
      continue;  // No match.
    segmentation->push_back(length);
    if (choices_pos + length == choices_length &&
        text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                rating + choice_rating, *best_rating, segmentation->size(),
                best_segmentation->size());
      }
      if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_text.size()) {
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index]
                     ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length, target_text,
                    text_index + 1, rating + choice_rating, segmentation,
                    best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    segmentation->truncate(segmentation->size() - 1);
  }
}