示例#1
0
/**
 * @name classify_blob
 *
 * Classify the this blob if it is not already recorded in the match
 * table. Attempt to recognize this blob as a character. The recognition
 * rating for this blob will be stored as a part of the blob. This value
 * will also be returned to the caller.
 * @param blob Current blob
 * @param string The string to display in ScrollView
 * @param color The colour to use when displayed with ScrollView
 */
BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob,
                                         const char *string, C_COL color,
                                         BlamerBundle *blamer_bundle) {
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_all_blobs)
    display_blob(blob, color);
#endif
  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
  BLOB_CHOICE_LIST* choices = call_matcher(blob);
  // If a blob with the same bounding box as one of the truth character
  // bounding boxes is not classified as the corresponding truth character
  // blame character classifier for incorrect answer.
  if (blamer_bundle != NULL) {
    blamer_bundle->BlameClassifier(getDict().getUnicharset(),
                                   blob->bounding_box(),
                                   *choices,
                                   wordrec_debug_blamer);
  }
  #ifndef GRAPHICS_DISABLED
  if (classify_debug_level && string)
    print_ratings_list(string, choices, getDict().getUnicharset());

  if (wordrec_blob_pause)
    window_wait(blob_window);
#endif

  return choices;
}
示例#2
0
void Wordrec::ProcessSegSearchPainPoint(
    float pain_point_priority,
    const MATRIX_COORD &pain_point, const char* pain_point_type,
    GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
    LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
            pain_point_type, pain_point_priority,
            pain_point.col, pain_point.row);
  }
  ASSERT_HOST(pain_points != NULL);
  MATRIX *ratings = word_res->ratings;
  // Classify blob [pain_point.col pain_point.row]
  if (!pain_point.Valid(*ratings)) {
    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
  }
  ASSERT_HOST(pain_point.Valid(*ratings));
  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
                                                pain_point.col, pain_point.row,
                                                pain_point_type,
                                                word_res->chopped_word,
                                                blamer_bundle);
  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
  if (lst == NULL) {
    ratings->put(pain_point.col, pain_point.row, classified);
  } else {
    // We can not delete old BLOB_CHOICEs, since they might contain
    // ViterbiStateEntries that are parents of other "active" entries.
    // Thus if the matrix cell already contains classifications we add
    // the new ones to the beginning of the list.
    BLOB_CHOICE_IT it(lst);
    it.add_list_before(classified);
    delete classified;  // safe to delete, since empty after add_list_before()
    classified = NULL;
  }

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (classified != NULL && !classified->empty()) {
    if (pain_point.col > 0) {
      pain_points->GeneratePainPoint(
          pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
    if (pain_point.row + 1 < ratings->dimension()) {
      pain_points->GeneratePainPoint(
          pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
  }
  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
}
示例#3
0
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                 WERD_RES* word_res) {
  // Classify all required combinations of blobs and save results in choices.
  int word_length = word_res->box_word->length();
  GenericVector<BLOB_CHOICE_LIST*>* choices =
      new GenericVector<BLOB_CHOICE_LIST*>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST* match_result = classify_piece(
          word_res->seam_array, i, i + j - 1, "Applybox",
          word_res->chopped_word, word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                &search_segmentation, &best_rating, &word_res->best_state);
  for (int i = 0; i < word_length; ++i)
    choices[i].delete_data_pointers();
  delete [] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (int s = 0; s < word_res->seam_array.size(); ++s) {
      SEAM* seam = word_res->seam_array[s];
      if (!seam->HasAnySplits()) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear();  // No good. Original segmentation bad size.
      return false;
    }
  }
  word_res->correct_text.clear();
  for (int i = 0; i < target_text.size(); ++i) {
    word_res->correct_text.push_back(
        STRING(unicharset.id_to_unichar(target_text[i])));
  }
  return true;
}
/**
 * print_char_choices_list
 */
void print_char_choices_list(const char *msg,
                             const BLOB_CHOICE_LIST_VECTOR &char_choices,
                             const UNICHARSET &current_unicharset,
                             BOOL8 detailed) {
  if (*msg != '\0') tprintf("%s\n", msg);
  for (int x = 0; x < char_choices.length(); ++x) {
    BLOB_CHOICE_IT c_it;
    c_it.set_to_list(char_choices.get(x));
    tprintf("char[%d]: %s\n", x,
            current_unicharset.debug_str( c_it.data()->unichar_id()).string());
    if (detailed)
      print_ratings_list("  ", char_choices.get(x), current_unicharset);
  }
}
示例#5
0
// Creates a fake blob choice from the combination of the given fragments.
// unichar is the class to be made from the combination,
// expanded_fragment_lengths[choice_index] is the number of fragments to use.
// old_choices[choice_index] has the classifier output for each fragment.
// choice index initially indexes the last fragment and should be decremented
// expanded_fragment_lengths[choice_index] times to get the earlier fragments.
// Guarantees to return something non-null, or abort!
BLOB_CHOICE* Wordrec::rebuild_fragments(
    const char* unichar,
    const char* expanded_fragment_lengths,
    int choice_index,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  float rating = 0.0f;
  float certainty = 0.0f;
  inT16 min_xheight = -MAX_INT16;
  inT16 max_xheight = MAX_INT16;
  for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1;
       fragment_pieces >= 0; --fragment_pieces, --choice_index) {
    // Get a pointer to the classifier results from the old_choices.
    BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index);
    // Populate fragment with updated values and look for the
    // fragment with the same values in current_choices.
    // Update rating and certainty of the character being composed.
    CHAR_FRAGMENT fragment;
    fragment.set_all(unichar, fragment_pieces,
                     expanded_fragment_lengths[choice_index], false);
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
        choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(choice->unichar_id());
      if (current_fragment && fragment.equals(current_fragment)) {
        rating += choice->rating();
        if (choice->certainty() < certainty) {
          certainty = choice->certainty();
        }
        IntersectRange(choice->min_xheight(), choice->max_xheight(),
                       &min_xheight, &max_xheight);
        break;
      }
    }
    if (choice_it.cycled_list()) {
      print_ratings_list("Failure", current_choices, unicharset);
      tprintf("Failed to find fragment %s at index=%d\n",
              fragment.to_string().string(), choice_index);
    }
    ASSERT_HOST(!choice_it.cycled_list());  // Be sure we found the fragment.
  }
  return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                         rating, certainty, -1, -1, 0,
                         min_xheight, max_xheight, false);
}
示例#6
0
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = nullptr;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it (new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
      werd_choice->print("ReplaceAmbig() ");
      tprintf("Modified blob_choices: ");
      print_ratings_list("\n", new_choices, getUnicharset());
  }
}
示例#7
0
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                            DANGERR *fixpt,
                            bool fix_replaceable,
                            MATRIX *ratings) {
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
            best_choice->debug_string().string());
  }

  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
  // for each unichar id in BestChoice.
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  int i;
  bool ambigs_found = false;
  // For each position in best_choice:
  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
  // -- look for ambiguities corresponding to wrong_ngram in the list while
  //    adding the following unichar_ids from best_choice to wrong_ngram
  //
  // Repeat the above procedure twice: first time look through
  // ambigs to be replaced and replace all the ambiguities found;
  // second time look through dangerous ambiguities and construct
  // ambig_blob_choices with fake a blob choice for each ambiguity
  // and pass them to dawg_permute_and_select() to search for
  // ambiguous words in the dictionaries.
  //
  // Note that during the execution of the for loop (on the first pass)
  // if replacements are made the length of best_choice might change.
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
    bool replace = (fix_replaceable && pass == 0);
    const UnicharAmbigsVector &table = replace ?
      getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
    if (!replace) {
      // Initialize ambig_blob_choices with lists containing a single
      // unichar id for the correspoding position in best_choice.
      // best_choice consisting from only the original letters will
      // have a rating of 0.0.
      for (i = 0; i < best_choice->length(); ++i) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        // TODO(rays/antonova) Put real xheights and y shifts here.
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
                                          0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
        ambig_blob_choices.push_back(lst);
      }
    }
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    int wrong_ngram_index;
    int next_index;
    int blob_index = 0;
    for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
         ++i) {
      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
      if (stopper_debug_level > 2) {
        tprintf("Looking for %s ngrams starting with %s:\n",
                replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).string());
      }
      int num_wrong_blobs = best_choice->state(i);
      wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
      if (curr_unichar_id == INVALID_UNICHAR_ID ||
          curr_unichar_id >= table.size() ||
          table[curr_unichar_id] == nullptr) {
        continue;  // there is no ambig spec for this unichar id
      }
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
                                                   ambig_spec->wrong_ngram);
        if (stopper_debug_level > 2) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }
        if (compare == 0) {
          // Record the place where we found an ambiguity.
          if (fixpt != nullptr) {
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
            fixpt->push_back(DANGERR_INFO(
                blob_index, blob_index + num_wrong_blobs, replace,
                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
                      blob_index + num_wrong_blobs, false,
                      getUnicharset().get_isngram(
                          ambig_spec->correct_ngram_id),
                      getUnicharset().id_to_unichar(leftmost_id));
            }
          }

          if (replace) {
            if (stopper_debug_level > 2) {
              tprintf("replace ambiguity with %s : ",
                      getUnicharset().id_to_unichar(
                          ambig_spec->correct_ngram_id));
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
                         ambig_spec->correct_ngram_id,
                         best_choice, ratings);
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // We found dang ambig - update ambig_blob_choices.
            if (stopper_debug_level > 2) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
                 ++tmp_index) {
              // Add a blob choice for the corresponding fragment of the
              // ambiguity. These fake blob choices are initialized with
              // negative ratings (which are not possible for real blob
              // choices), so that dawg_permute_and_select() considers any
              // word not consisting of only the original letters a better
              // choice and stops searching for alternatives once such a
              // choice is found.
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
                  -1, 0, 1, 0, BCC_AMBIG));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
              ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
            // Add the next unichar id to wrong_ngram and keep looking for
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
            wrong_ngram[++wrong_ngram_index] =
              best_choice->unichar_id(next_index);
            num_wrong_blobs += best_choice->state(next_index);
          } else {
            break;  // no more matching ambigs in this AMBIG_SPEC_LIST
          }
        } else {
          spec_it.forward();
        }
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs

  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
    if (stopper_debug_level > 2) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (i = 0; i < ambig_blob_choices.length(); ++i) {
        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf ("Stopper: Possible ambiguous word = %s\n",
                 alt_word->debug_string().string());
      }
      if (fixpt != nullptr) {
        // Note: Currently character choices combined from fragments can only
        // be generated by NoDangrousAmbigs(). This code should be updated if
        // the capability to produce classifications combined from character
        // fragments is added to other functions.
        int orig_i = 0;
        for (i = 0; i < alt_word->length(); ++i) {
          const UNICHARSET &uchset = getUnicharset();
          bool replacement_is_ngram =
              uchset.get_isngram(alt_word->unichar_id(i));
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
          if (replacement_is_ngram) {
            // we have to extract the leftmost unichar from the ngram.
            const char *str = uchset.id_to_unichar(leftmost_id);
            int step = uchset.step(str);
            if (step) leftmost_id = uchset.unichar_to_id(str, step);
          }
          int end_i = orig_i + alt_word->state(i);
          if (alt_word->state(i) > 1 ||
              (orig_i + 1 == end_i && replacement_is_ngram)) {
            // Compute proper blob indices.
            int blob_start = 0;
            for (int j = 0; j < orig_i; ++j)
              blob_start += best_choice->state(j);
            int blob_end = blob_start;
            for (int j = orig_i; j < end_i; ++j)
              blob_end += best_choice->state(j);
            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
                                          replacement_is_ngram, leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
                      true, replacement_is_ngram,
                      uchset.id_to_unichar(leftmost_id));
            }
          }
          orig_i += alt_word->state(i);
        }
      }
    }
    delete alt_word;
  }
  if (output_ambig_words_file_ != nullptr) {
    fprintf(output_ambig_words_file_, "\n");
  }

  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
}
示例#8
0
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantess that all the parents would be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
        ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
示例#9
0
/**********************************************************************
 * merge_and_put_fragment_lists
 *
 * Merge the fragment lists in choice_lists and append it to the
 * ratings matrix.
 **********************************************************************/
void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
                                           inT16 num_frag_parts,
                                           BLOB_CHOICE_LIST *choice_lists,
                                           MATRIX *ratings) {
  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];

  for (int i = 0; i < num_frag_parts; i++) {
    choice_lists_it[i].set_to_list(&choice_lists[i]);
    choice_lists_it[i].mark_cycle_pt();
  }

  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
  if (merged_choice == NULL)
    merged_choice = new BLOB_CHOICE_LIST;

  bool end_of_list = false;
  BLOB_CHOICE_IT merged_choice_it(merged_choice);
  while (!end_of_list) {
    // Find the maximum unichar_id of the current entry the iterators
    // are pointing at
    UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (max_unichar_id < unichar_id) {
        max_unichar_id = unichar_id;
      }
    }

    // Move the each iterators until it gets to an entry that has a
    // value greater than or equal to max_unichar_id
    for (int i = 0; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      while (!choice_lists_it[i].cycled_list() &&
             unichar_id < max_unichar_id) {
        choice_lists_it[i].forward();
        unichar_id = choice_lists_it[i].data()->unichar_id();
      }
      if (choice_lists_it[i].cycled_list()) {
        end_of_list = true;
        break;
      }
    }

    if (end_of_list)
      break;

    // Checks if the fragments are parts of the same character
    UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
    bool same_unichar = true;
    for (int i = 1; i < num_frag_parts; i++) {
      UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
      if (unichar_id != first_unichar_id) {
        same_unichar = false;
        break;
      }
    }

    if (same_unichar) {
      // Add the merged character to the result
      UNICHAR_ID merged_unichar_id = first_unichar_id;
      inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
      inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
      float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
      float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
      float positive_yshift = 0, negative_yshift = 0;
      int merged_script_id = choice_lists_it[0].data()->script_id();
      BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();

      float merged_rating = 0, merged_certainty = 0;
      for (int i = 0; i < num_frag_parts; i++) {
        float rating = choice_lists_it[i].data()->rating();
        float certainty = choice_lists_it[i].data()->certainty();

        if (i == 0 || certainty < merged_certainty)
          merged_certainty = certainty;
        merged_rating += rating;

        choice_lists_it[i].forward();
        if (choice_lists_it[i].cycled_list())
          end_of_list = true;
        IntersectRange(choice_lists_it[i].data()->min_xheight(),
                       choice_lists_it[i].data()->max_xheight(),
                       &merged_min_xheight, &merged_max_xheight);
        float yshift = choice_lists_it[i].data()->yshift();
        if (yshift > positive_yshift) positive_yshift = yshift;
        if (yshift < negative_yshift) negative_yshift = yshift;
      }

      float merged_yshift = positive_yshift != 0
          ? (negative_yshift != 0 ? 0 : positive_yshift)
          : negative_yshift;
      merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
                                                  merged_rating,
                                                  merged_certainty,
                                                  merged_fontinfo_id,
                                                  merged_fontinfo_id2,
                                                  merged_script_id,
                                                  merged_min_xheight,
                                                  merged_max_xheight,
                                                  merged_yshift,
                                                  classifier));
    }
  }

  if (classify_debug_level)
    print_ratings_list("Merged Fragments", merged_choice,
                       unicharset);

  if (merged_choice->empty())
    delete merged_choice;
  else
    ratings->put(row, column, merged_choice);

  delete [] choice_lists_it;
}