/** * @name classify_blob * * Classify the this blob if it is not already recorded in the match * table. Attempt to recognize this blob as a character. The recognition * rating for this blob will be stored as a part of the blob. This value * will also be returned to the caller. * @param blob Current blob * @param string The string to display in ScrollView * @param color The colour to use when displayed with ScrollView */ BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle) { #ifndef GRAPHICS_DISABLED if (wordrec_display_all_blobs) display_blob(blob, color); #endif // TODO(rays) collapse with call_matcher and move all to wordrec.cpp. BLOB_CHOICE_LIST* choices = call_matcher(blob); // If a blob with the same bounding box as one of the truth character // bounding boxes is not classified as the corresponding truth character // blame character classifier for incorrect answer. if (blamer_bundle != NULL) { blamer_bundle->BlameClassifier(getDict().getUnicharset(), blob->bounding_box(), *choices, wordrec_debug_blamer); } #ifndef GRAPHICS_DISABLED if (classify_debug_level && string) print_ratings_list(string, choices, getDict().getUnicharset()); if (wordrec_blob_pause) window_wait(blob_window); #endif return choices; }
void Wordrec::ProcessSegSearchPainPoint( float pain_point_priority, const MATRIX_COORD &pain_point, const char* pain_point_type, GenericVector<SegSearchPending>* pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) { if (segsearch_debug_level > 0) { tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n", pain_point_type, pain_point_priority, pain_point.col, pain_point.row); } ASSERT_HOST(pain_points != NULL); MATRIX *ratings = word_res->ratings; // Classify blob [pain_point.col pain_point.row] if (!pain_point.Valid(*ratings)) { ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col); } ASSERT_HOST(pain_point.Valid(*ratings)); BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array, pain_point.col, pain_point.row, pain_point_type, word_res->chopped_word, blamer_bundle); BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row); if (lst == NULL) { ratings->put(pain_point.col, pain_point.row, classified); } else { // We can not delete old BLOB_CHOICEs, since they might contain // ViterbiStateEntries that are parents of other "active" entries. // Thus if the matrix cell already contains classifications we add // the new ones to the beginning of the list. BLOB_CHOICE_IT it(lst); it.add_list_before(classified); delete classified; // safe to delete, since empty after add_list_before() classified = NULL; } if (segsearch_debug_level > 0) { print_ratings_list("Updated ratings matrix with a new entry:", ratings->get(pain_point.col, pain_point.row), getDict().getUnicharset()); ratings->print(getDict().getUnicharset()); } // Insert initial "pain points" to join the newly classified blob // with its left and right neighbors. if (classified != NULL && !classified->empty()) { if (pain_point.col > 0) { pain_points->GeneratePainPoint( pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0, true, segsearch_max_char_wh_ratio, word_res); } if (pain_point.row + 1 < ratings->dimension()) { pain_points->GeneratePainPoint( pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0, true, segsearch_max_char_wh_ratio, word_res); } } (*pending)[pain_point.col].SetBlobClassified(pain_point.row); }
/// Resegments the word to achieve the target_text from the classifier. /// Returns false if the re-segmentation fails. /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and /// applies a full search on the classifier results to find the best classified /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity /// substitutions ARE used. bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, WERD_RES* word_res) { // Classify all required combinations of blobs and save results in choices. int word_length = word_res->box_word->length(); GenericVector<BLOB_CHOICE_LIST*>* choices = new GenericVector<BLOB_CHOICE_LIST*>[word_length]; for (int i = 0; i < word_length; ++i) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { BLOB_CHOICE_LIST* match_result = classify_piece( word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word, word_res->blamer_bundle); if (applybox_debug > 2) { tprintf("%d+%d:", i, j); print_ratings_list("Segment:", match_result, unicharset); } choices[i].push_back(match_result); } } // Search the segmentation graph for the target text. Must be an exact // match. Using wildcards makes it difficult to find the correct // segmentation even when it is there. word_res->best_state.clear(); GenericVector<int> search_segmentation; float best_rating = 0.0f; SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, &word_res->best_state); for (int i = 0; i < word_length; ++i) choices[i].delete_data_pointers(); delete [] choices; if (word_res->best_state.empty()) { // Build the original segmentation and if it is the same length as the // truth, assume it will do. int blob_count = 1; for (int s = 0; s < word_res->seam_array.size(); ++s) { SEAM* seam = word_res->seam_array[s]; if (!seam->HasAnySplits()) { word_res->best_state.push_back(blob_count); blob_count = 1; } else { ++blob_count; } } word_res->best_state.push_back(blob_count); if (word_res->best_state.size() != target_text.size()) { word_res->best_state.clear(); // No good. Original segmentation bad size. return false; } } word_res->correct_text.clear(); for (int i = 0; i < target_text.size(); ++i) { word_res->correct_text.push_back( STRING(unicharset.id_to_unichar(target_text[i]))); } return true; }
/** * print_char_choices_list */ void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET ¤t_unicharset, BOOL8 detailed) { if (*msg != '\0') tprintf("%s\n", msg); for (int x = 0; x < char_choices.length(); ++x) { BLOB_CHOICE_IT c_it; c_it.set_to_list(char_choices.get(x)); tprintf("char[%d]: %s\n", x, current_unicharset.debug_str( c_it.data()->unichar_id()).string()); if (detailed) print_ratings_list(" ", char_choices.get(x), current_unicharset); } }
// Creates a fake blob choice from the combination of the given fragments. // unichar is the class to be made from the combination, // expanded_fragment_lengths[choice_index] is the number of fragments to use. // old_choices[choice_index] has the classifier output for each fragment. // choice index initially indexes the last fragment and should be decremented // expanded_fragment_lengths[choice_index] times to get the earlier fragments. // Guarantees to return something non-null, or abort! BLOB_CHOICE* Wordrec::rebuild_fragments( const char* unichar, const char* expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) { float rating = 0.0f; float certainty = 0.0f; inT16 min_xheight = -MAX_INT16; inT16 max_xheight = MAX_INT16; for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1; fragment_pieces >= 0; --fragment_pieces, --choice_index) { // Get a pointer to the classifier results from the old_choices. BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index); // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[choice_index], false); BLOB_CHOICE_IT choice_it(current_choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(choice->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += choice->rating(); if (choice->certainty() < certainty) { certainty = choice->certainty(); } IntersectRange(choice->min_xheight(), choice->max_xheight(), &min_xheight, &max_xheight); break; } } if (choice_it.cycled_list()) { print_ratings_list("Failure", current_choices, unicharset); tprintf("Failed to find fragment %s at index=%d\n", fragment.to_string().string(), choice_index); } ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment. } return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, -1, -1, 0, min_xheight, max_xheight, false); }
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings) { int num_blobs_to_replace = 0; int begin_blob_index = 0; int i; // Rating and certainty for the new BLOB_CHOICE are derived from the // replaced choices. float new_rating = 0.0f; float new_certainty = 0.0f; BLOB_CHOICE* old_choice = nullptr; for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) { if (i >= wrong_ngram_begin_index) { int num_blobs = werd_choice->state(i); int col = begin_blob_index + num_blobs_to_replace; int row = col + num_blobs - 1; BLOB_CHOICE_LIST* choices = ratings->get(col, row); ASSERT_HOST(choices != nullptr); old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices); ASSERT_HOST(old_choice != nullptr); new_rating += old_choice->rating(); new_certainty += old_choice->certainty(); num_blobs_to_replace += num_blobs; } else { begin_blob_index += werd_choice->state(i); } } new_certainty /= wrong_ngram_size; // If there is no entry in the ratings matrix, add it. MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1); if (!coord.Valid(*ratings)) { ratings->IncreaseBandSize(coord.row - coord.col + 1); } if (ratings->get(coord.col, coord.row) == nullptr) ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST); BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row); BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices); if (choice != nullptr) { // Already there. Upgrade if new rating better. if (new_rating < choice->rating()) choice->set_rating(new_rating); if (new_certainty < choice->certainty()) choice->set_certainty(new_certainty); // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState. } else { // Need a new choice with the correct_ngram_id. choice = new BLOB_CHOICE(*old_choice); choice->set_unichar_id(correct_ngram_id); choice->set_rating(new_rating); choice->set_certainty(new_certainty); choice->set_classifier(BCC_AMBIG); choice->set_matrix_cell(coord.col, coord.row); BLOB_CHOICE_IT it (new_choices); it.add_to_end(choice); } // Remove current unichar from werd_choice. On the last iteration // set the correct replacement unichar instead of removing a unichar. for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) { if (replaced_count + 1 == wrong_ngram_size) { werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice); } else { werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1); } } if (stopper_debug_level >= 1) { werd_choice->print("ReplaceAmbig() "); tprintf("Modified blob_choices: "); print_ratings_list("\n", new_choices, getUnicharset()); } }
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings) { if (stopper_debug_level > 2) { tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().string()); } // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities // for each unichar id in BestChoice. BLOB_CHOICE_LIST_VECTOR ambig_blob_choices; int i; bool ambigs_found = false; // For each position in best_choice: // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i] // -- initialize wrong_ngram with a single unichar_id at best_choice[i] // -- look for ambiguities corresponding to wrong_ngram in the list while // adding the following unichar_ids from best_choice to wrong_ngram // // Repeat the above procedure twice: first time look through // ambigs to be replaced and replace all the ambiguities found; // second time look through dangerous ambiguities and construct // ambig_blob_choices with fake a blob choice for each ambiguity // and pass them to dawg_permute_and_select() to search for // ambiguous words in the dictionaries. // // Note that during the execution of the for loop (on the first pass) // if replacements are made the length of best_choice might change. for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) { bool replace = (fix_replaceable && pass == 0); const UnicharAmbigsVector &table = replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs(); if (!replace) { // Initialize ambig_blob_choices with lists containing a single // unichar id for the correspoding position in best_choice. // best_choice consisting from only the original letters will // have a rating of 0.0. for (i = 0; i < best_choice->length(); ++i) { BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST(); BLOB_CHOICE_IT lst_it(lst); // TODO(rays/antonova) Put real xheights and y shifts here. lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); ambig_blob_choices.push_back(lst); } } UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; int wrong_ngram_index; int next_index; int blob_index = 0; for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) { UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i); if (stopper_debug_level > 2) { tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous", getUnicharset().debug_str(curr_unichar_id).string()); } int num_wrong_blobs = best_choice->state(i); wrong_ngram_index = 0; wrong_ngram[wrong_ngram_index] = curr_unichar_id; if (curr_unichar_id == INVALID_UNICHAR_ID || curr_unichar_id >= table.size() || table[curr_unichar_id] == nullptr) { continue; // there is no ambig spec for this unichar id } AmbigSpec_IT spec_it(table[curr_unichar_id]); for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) { const AmbigSpec *ambig_spec = spec_it.data(); wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID; int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram); if (stopper_debug_level > 2) { tprintf("candidate ngram: "); UnicharIdArrayUtils::print(wrong_ngram, getUnicharset()); tprintf("current ngram from spec: "); UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset()); tprintf("comparison result: %d\n", compare); } if (compare == 0) { // Record the place where we found an ambiguity. if (fixpt != nullptr) { UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0]; fixpt->push_back(DANGERR_INFO( blob_index, blob_index + num_wrong_blobs, replace, getUnicharset().get_isngram(ambig_spec->correct_ngram_id), leftmost_id)); if (stopper_debug_level > 1) { tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false, getUnicharset().get_isngram( ambig_spec->correct_ngram_id), getUnicharset().id_to_unichar(leftmost_id)); } } if (replace) { if (stopper_debug_level > 2) { tprintf("replace ambiguity with %s : ", getUnicharset().id_to_unichar( ambig_spec->correct_ngram_id)); UnicharIdArrayUtils::print( ambig_spec->correct_fragments, getUnicharset()); } ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice, ratings); } else if (i > 0 || ambig_spec->type != CASE_AMBIG) { // We found dang ambig - update ambig_blob_choices. if (stopper_debug_level > 2) { tprintf("found ambiguity: "); UnicharIdArrayUtils::print( ambig_spec->correct_fragments, getUnicharset()); } ambigs_found = true; for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) { // Add a blob choice for the corresponding fragment of the // ambiguity. These fake blob choices are initialized with // negative ratings (which are not possible for real blob // choices), so that dawg_permute_and_select() considers any // word not consisting of only the original letters a better // choice and stops searching for alternatives once such a // choice is found. BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); bc_it.add_to_end(new BLOB_CHOICE( ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); } } spec_it.forward(); } else if (compare == -1) { if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size && ((next_index = wrong_ngram_index+1+i) < best_choice->length())) { // Add the next unichar id to wrong_ngram and keep looking for // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST. wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index); num_wrong_blobs += best_choice->state(next_index); } else { break; // no more matching ambigs in this AMBIG_SPEC_LIST } } else { spec_it.forward(); } } // end searching AmbigSpec_LIST } // end searching best_choice } // end searching replace and dangerous ambigs // If any ambiguities were found permute the constructed ambig_blob_choices // to see if an alternative dictionary word can be found. if (ambigs_found) { if (stopper_debug_level > 2) { tprintf("\nResulting ambig_blob_choices:\n"); for (i = 0; i < ambig_blob_choices.length(); ++i) { print_ratings_list("", ambig_blob_choices.get(i), getUnicharset()); tprintf("\n"); } } WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0); ambigs_found = (alt_word->rating() < 0.0); if (ambigs_found) { if (stopper_debug_level >= 1) { tprintf ("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().string()); } if (fixpt != nullptr) { // Note: Currently character choices combined from fragments can only // be generated by NoDangrousAmbigs(). This code should be updated if // the capability to produce classifications combined from character // fragments is added to other functions. int orig_i = 0; for (i = 0; i < alt_word->length(); ++i) { const UNICHARSET &uchset = getUnicharset(); bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i)); UNICHAR_ID leftmost_id = alt_word->unichar_id(i); if (replacement_is_ngram) { // we have to extract the leftmost unichar from the ngram. const char *str = uchset.id_to_unichar(leftmost_id); int step = uchset.step(str); if (step) leftmost_id = uchset.unichar_to_id(str, step); } int end_i = orig_i + alt_word->state(i); if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) { // Compute proper blob indices. int blob_start = 0; for (int j = 0; j < orig_i; ++j) blob_start += best_choice->state(j); int blob_end = blob_start; for (int j = orig_i; j < end_i; ++j) blob_end += best_choice->state(j); fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id)); if (stopper_debug_level > 1) { tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true, replacement_is_ngram, uchset.id_to_unichar(leftmost_id)); } } orig_i += alt_word->state(i); } } } delete alt_word; } if (output_ambig_words_file_ != nullptr) { fprintf(output_ambig_words_file_, "\n"); } ambig_blob_choices.delete_data_pointers(); return !ambigs_found; }
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record, WERD_CHOICE *best_choice, BLOB_CHOICE_LIST_VECTOR *best_char_choices, WERD_CHOICE *raw_choice, STATE *output_best_state) { int row, col = 0; if (segsearch_debug_level > 0) { tprintf("Starting SegSearch on ratings matrix:\n"); chunks_record->ratings->print(getDict().getUnicharset()); } // Start with a fresh best_choice since rating adjustments // used by the chopper and the new segmentation search are not compatible. best_choice->set_rating(WERD_CHOICE::kBadRating); // Clear best choice accumulator (that is used for adaption), so that // choices adjusted by chopper do not interfere with the results from the // segmentation search. getDict().ClearBestChoiceAccum(); MATRIX *ratings = chunks_record->ratings; // Priority queue containing pain points generated by the language model // The priority is set by the language model components, adjustments like // seam cost and width priority are factored into the priority. HEAP *pain_points = MakeHeap(segsearch_max_pain_points); // best_path_by_column records the lowest cost path found so far for each // column of the chunks_record->ratings matrix over all the rows. BestPathByColumn *best_path_by_column = new BestPathByColumn[ratings->dimension()]; for (col = 0; col < ratings->dimension(); ++col) { best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating; best_path_by_column[col].best_vse = NULL; } language_model_->InitForWord(prev_word_best_choice_, &denorm_, assume_fixed_pitch_char_segment, best_choice->certainty(), segsearch_max_char_wh_ratio, pain_points, chunks_record); MATRIX_COORD *pain_point; float pain_point_priority; BestChoiceBundle best_choice_bundle( output_best_state, best_choice, raw_choice, best_char_choices); // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs, // where i is the column of the child. Initially all the classified entries // in the ratings matrix from column 0 (with parent NULL) are inserted into // pending[0]. As the language model state is updated, new child/parent // pairs are inserted into the lists. Next, the entries in pending[1] are // considered, and so on. It is important that during the update the // children are considered in the non-decreasing order of their column, since // this guarantess that all the parents would be up to date before an update // of a child is done. SEG_SEARCH_PENDING_LIST *pending = new SEG_SEARCH_PENDING_LIST[ratings->dimension()]; // Search for the ratings matrix for the initial best path. for (row = 0; row < ratings->dimension(); ++row) { if (ratings->get(0, row) != NOT_CLASSIFIED) { pending[0].add_sorted( SEG_SEARCH_PENDING::compare, true, new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag)); } } UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record, pain_points, &best_choice_bundle); // Keep trying to find a better path by fixing the "pain points". int num_futile_classifications = 0; while (!(language_model_->AcceptableChoiceFound() || num_futile_classifications >= segsearch_max_futile_classifications)) { // Get the next valid "pain point". int pop; while (true) { pop = HeapPop(pain_points, &pain_point_priority, &pain_point); if (pop == EMPTY) break; if (pain_point->Valid(*ratings) && ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) { break; } else { delete pain_point; } } if (pop == EMPTY) { if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n"); break; } if (segsearch_debug_level > 0) { tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n", pain_point_priority, pain_point->col, pain_point->row); } BLOB_CHOICE_LIST *classified = classify_piece( chunks_record->chunks, chunks_record->splits, pain_point->col, pain_point->row); ratings->put(pain_point->col, pain_point->row, classified); if (segsearch_debug_level > 0) { print_ratings_list("Updated ratings matrix with a new entry:", ratings->get(pain_point->col, pain_point->row), getDict().getUnicharset()); chunks_record->ratings->print(getDict().getUnicharset()); } // Insert initial "pain points" to join the newly classified blob // with its left and right neighbors. if (!classified->empty()) { float worst_piece_cert; bool fragmented; if (pain_point->col > 0) { language_model_->GetWorstPieceCertainty( pain_point->col-1, pain_point->row, chunks_record->ratings, &worst_piece_cert, &fragmented); language_model_->GeneratePainPoint( pain_point->col-1, pain_point->row, false, LanguageModel::kInitialPainPointPriorityAdjustment, worst_piece_cert, fragmented, best_choice->certainty(), segsearch_max_char_wh_ratio, NULL, NULL, chunks_record, pain_points); } if (pain_point->row+1 < ratings->dimension()) { language_model_->GetWorstPieceCertainty( pain_point->col, pain_point->row+1, chunks_record->ratings, &worst_piece_cert, &fragmented); language_model_->GeneratePainPoint( pain_point->col, pain_point->row+1, true, LanguageModel::kInitialPainPointPriorityAdjustment, worst_piece_cert, fragmented, best_choice->certainty(), segsearch_max_char_wh_ratio, NULL, NULL, chunks_record, pain_points); } } // Record a pending entry with the pain_point and each of its parents. int parent_row = pain_point->col - 1; if (parent_row < 0) { // this node has no parents pending[pain_point->col].add_sorted( SEG_SEARCH_PENDING::compare, true, new SEG_SEARCH_PENDING(pain_point->row, NULL, LanguageModel::kAllChangedFlag)); } else { for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) { if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) { pending[pain_point->col].add_sorted( SEG_SEARCH_PENDING::compare, true, new SEG_SEARCH_PENDING(pain_point->row, ratings->get(parent_col, parent_row), LanguageModel::kAllChangedFlag)); } } } UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column, chunks_record, pain_points, &best_choice_bundle); if (!best_choice_bundle.updated) ++num_futile_classifications; if (segsearch_debug_level > 0) { tprintf("num_futile_classifications %d\n", num_futile_classifications); } // Clean up best_choice_bundle.updated = false; delete pain_point; // done using this pain point } if (segsearch_debug_level > 0) { tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n", language_model_->AcceptableChoiceFound()); } // Clean up. FreeHeapData(pain_points, MATRIX_COORD::Delete); delete[] best_path_by_column; delete[] pending; for (row = 0; row < ratings->dimension(); ++row) { for (col = 0; col <= row; ++col) { BLOB_CHOICE_LIST *rating = ratings->get(col, row); if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating); } } }
/********************************************************************** * merge_and_put_fragment_lists * * Merge the fragment lists in choice_lists and append it to the * ratings matrix. **********************************************************************/ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) { BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; for (int i = 0; i < num_frag_parts; i++) { choice_lists_it[i].set_to_list(&choice_lists[i]); choice_lists_it[i].mark_cycle_pt(); } BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); if (merged_choice == NULL) merged_choice = new BLOB_CHOICE_LIST; bool end_of_list = false; BLOB_CHOICE_IT merged_choice_it(merged_choice); while (!end_of_list) { // Find the maximum unichar_id of the current entry the iterators // are pointing at UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (max_unichar_id < unichar_id) { max_unichar_id = unichar_id; } } // Move the each iterators until it gets to an entry that has a // value greater than or equal to max_unichar_id for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); while (!choice_lists_it[i].cycled_list() && unichar_id < max_unichar_id) { choice_lists_it[i].forward(); unichar_id = choice_lists_it[i].data()->unichar_id(); } if (choice_lists_it[i].cycled_list()) { end_of_list = true; break; } } if (end_of_list) break; // Checks if the fragments are parts of the same character UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); bool same_unichar = true; for (int i = 1; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (unichar_id != first_unichar_id) { same_unichar = false; break; } } if (same_unichar) { // Add the merged character to the result UNICHAR_ID merged_unichar_id = first_unichar_id; inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); float merged_min_xheight = choice_lists_it[0].data()->min_xheight(); float merged_max_xheight = choice_lists_it[0].data()->max_xheight(); float positive_yshift = 0, negative_yshift = 0; int merged_script_id = choice_lists_it[0].data()->script_id(); BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier(); float merged_rating = 0, merged_certainty = 0; for (int i = 0; i < num_frag_parts; i++) { float rating = choice_lists_it[i].data()->rating(); float certainty = choice_lists_it[i].data()->certainty(); if (i == 0 || certainty < merged_certainty) merged_certainty = certainty; merged_rating += rating; choice_lists_it[i].forward(); if (choice_lists_it[i].cycled_list()) end_of_list = true; IntersectRange(choice_lists_it[i].data()->min_xheight(), choice_lists_it[i].data()->max_xheight(), &merged_min_xheight, &merged_max_xheight); float yshift = choice_lists_it[i].data()->yshift(); if (yshift > positive_yshift) positive_yshift = yshift; if (yshift < negative_yshift) negative_yshift = yshift; } float merged_yshift = positive_yshift != 0 ? (negative_yshift != 0 ? 0 : positive_yshift) : negative_yshift; merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, merged_rating, merged_certainty, merged_fontinfo_id, merged_fontinfo_id2, merged_script_id, merged_min_xheight, merged_max_xheight, merged_yshift, classifier)); } } if (classify_debug_level) print_ratings_list("Merged Fragments", merged_choice, unicharset); if (merged_choice->empty()) delete merged_choice; else ratings->put(row, column, merged_choice); delete [] choice_lists_it; }