/**********************************************************************
 * blobs_widths
 *
 * Measure a linked list of blobs and return a freshly memalloc'ed
 * WIDTH_RECORD.  num_chars is set to the blob count and widths[] is
 * filled as: width(0), gap(0,1), width(1), gap(1,2), width(2), ...
 * where a width is botright.x - topleft.x of a blob's bounding box and
 * a gap is the distance from the previous blob's right edge to the
 * current blob's left edge.
 *
 * The list must contain at least one blob: the head is measured and
 * its next pointer is followed unconditionally.
 **********************************************************************/
WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  const int blob_count = count_blobs(blobs);

  /* Room for num_chars plus (2 * blob_count - 1) width/gap entries. */
  WIDTH_RECORD *record =
      (WIDTH_RECORD *) memalloc(sizeof(int) * blob_count * 2);
  record->num_chars = blob_count;

  TPOINT upper_left;             /*bounding box */
  TPOINT lower_right;
  int slot = 0;                  /* next free entry in widths[] */

  /* The head blob contributes only a width. */
  blob_bounding_box(blobs, &upper_left, &lower_right);
  record->widths[slot] = lower_right.x - upper_left.x;
  ++slot;
  int prev_right = lower_right.x;

  /* Every following blob contributes a gap and then its own width. */
  TBLOB *cur = blobs->next;
  while (cur != NULL) {
    blob_bounding_box(cur, &upper_left, &lower_right);
    record->widths[slot] = upper_left.x - prev_right;
    record->widths[slot + 1] = lower_right.x - upper_left.x;
    slot += 2;
    prev_right = lower_right.x;
    cur = cur->next;
  }
  return record;
}
/** * @name improve_by_chopping * * Start with the current word of blobs and its classification. Find * the worst blobs and try to divide them up to improve the ratings. * As long as ratings are produced by the new blob splitting. When * all the splitting has been accomplished all the ratings memory is * reclaimed. */ void Wordrec::improve_by_chopping(WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *best_choice_acceptable) { inT32 blob_number; float old_best; int fixpt_valid = 1; bool updated_best_choice = false; while (1) { // improvement loop if (!fixpt_valid) fixpt->clear(); old_best = word->best_choice->rating(); if (improve_one_blob(word->chopped_word, char_choices, &blob_number, &word->seam_array, fixpt, (fragments_guide_chopper && word->best_choice->fragment_mark()))) { getDict().LogNewSplit(blob_number); updated_best_choice = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (old_best > word->best_choice->rating()) { set_n_ones(best_state, char_choices->length() - 1); fixpt_valid = 1; } else { insert_new_chunk(best_state, blob_number, char_choices->length() - 2); fixpt_valid = 0; } if (chop_debug) print_state("best state = ", best_state, count_blobs(word->chopped_word->blobs) - 1); } else { break; } // Check if we should break from the loop. bool done = false; bool replaced = false; if ((updated_best_choice && (*best_choice_acceptable = getDict().AcceptableChoice(char_choices, word->best_choice, fixpt, CHOPPER_CALLER, &replaced))) || char_choices->length() >= MAX_NUM_CHUNKS) { done = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices); if (done) break; } if (!fixpt_valid) fixpt->clear(); }
/**********************************************************************
 * record_blob_bounds
 *
 * Allocate an array with one TBOX per blob in the list and fill it, in
 * list order, with each blob's bounding_box().
 *
 * The array is created with new[]; the caller owns it and must
 * delete[] it.
 **********************************************************************/
TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) {
  int nblobs = count_blobs(blobs);
  TBOX *bboxes = new TBOX[nblobs];

  // The write index has the same width as the element count, so a long
  // list cannot wrap a narrower counter to a negative array index.
  int x = 0;
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    bboxes[x] = blob->bounding_box();
    x++;
  }
  return bboxes;
}
/********************************************************************** * record_blob_bounds * * Set up and initialize an array that holds the bounds of a set of * blobs. **********************************************************************/ BOUNDS_LIST record_blob_bounds(TBLOB *blobs) { TBLOB *blob; BOUNDS_LIST bounds; TPOINT topleft; TPOINT botright; INT16 x = 0; bounds = (BOUNDS_LIST) memalloc (count_blobs (blobs) * sizeof (BOUNDS)); iterate_blobs(blob, blobs) { blob_bounding_box(blob, &topleft, &botright); set_bounds_entry(bounds, x, topleft, botright); x++; }
/**
 * evaluate_chunks
 *
 * A particular word level segmentation has been chosen.  Walk its
 * pieces from left to right, obtain the choice list for each piece via
 * get_piece_rating() and collect the lists in a newly allocated vector.
 *
 * search_state[0] is the number of explicit piece sizes that follow;
 * the final piece always runs to the last chunk of the word.  For each
 * piece, last_segmentation[] receives the certainty and rating of the
 * choice the iterator points at right after set_to_list(), plus the
 * width and gap of the piece.
 *
 * Returns NULL (after deleting the partially built vector) as soon as
 * get_piece_rating() returns NULL for a piece.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::evaluate_chunks(CHUNKS_RECORD *chunks_record,
                                                  SEARCH_STATE search_state,
                                                  BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *result = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_IT choice_it;
  int first_chunk = 0;

  // Iterate sub-paths; piece is the zero-based index of the sub-path.
  for (int piece = 0; piece <= search_state[0]; ++piece) {
    const bool is_final_piece = piece >= search_state[0];
    const int last_chunk = is_final_piece
        ? count_blobs(chunks_record->chunks) - 1
        : first_chunk + search_state[piece + 1];

    // Process one square.
    // Classify if needed.
    BLOB_CHOICE_LIST *piece_choices =
        get_piece_rating(chunks_record->ratings,
                         chunks_record->chunks,
                         chunks_record->word_res->denorm,
                         chunks_record->splits,
                         first_chunk, last_chunk, blamer_bundle);
    if (piece_choices == NULL) {
      delete result;
      return NULL;
    }

    // Add permuted ratings.
    choice_it.set_to_list(piece_choices);
    last_segmentation[piece].certainty = choice_it.data()->certainty();
    last_segmentation[piece].match = choice_it.data()->rating();
    last_segmentation[piece].width =
        AssociateUtils::GetChunksWidth(chunks_record->chunk_widths,
                                       first_chunk, last_chunk);
    last_segmentation[piece].gap =
        AssociateUtils::GetChunksGap(chunks_record->chunk_widths, last_chunk);

    *result += piece_choices;
    first_chunk = last_chunk + 1;
  }
  return result;
}
/**
 * evaluate_chunks
 *
 * A particular word level segmentation has been chosen.  Walk its
 * pieces from left to right, obtain the choice list for each piece via
 * get_piece_rating() and collect the lists in a newly allocated vector.
 *
 * search_state[0] is the number of explicit piece sizes that follow;
 * the final piece always runs to the last chunk of the word.  For each
 * piece, last_segmentation[] receives the certainty and rating of the
 * choice the iterator points at right after set_to_list(), plus the
 * width and gap of the piece.
 *
 * Returns NULL (after deleting the partially built vector) when
 * tord_blob_skip is set or when get_piece_rating() returns NULL.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::evaluate_chunks(CHUNKS_RECORD *chunks_record,
                                                  SEARCH_STATE search_state) {
  BLOB_CHOICE_LIST_VECTOR *result = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_IT choice_it;
  int first_chunk = 0;

  /* Iterate sub-paths; piece is the zero-based index of the sub-path */
  for (int piece = 0; piece <= search_state[0]; ++piece) {
    const bool is_final_piece = piece >= search_state[0];
    const int last_chunk = is_final_piece
        ? count_blobs(chunks_record->chunks) - 1
        : first_chunk + search_state[piece + 1];

    /* Abandon the evaluation if a skip has been requested */
    if (tord_blob_skip) {
      delete result;
      return NULL;
    }

    /* Process one square */
    /* Classify if needed */
    BLOB_CHOICE_LIST *piece_choices =
        get_piece_rating(chunks_record->ratings,
                         chunks_record->chunks,
                         chunks_record->splits,
                         first_chunk, last_chunk);
    if (piece_choices == NULL) {
      delete result;
      return NULL;
    }

    /* Add permuted ratings */
    choice_it.set_to_list(piece_choices);
    last_segmentation[piece].certainty = choice_it.data()->certainty();
    last_segmentation[piece].match = choice_it.data()->rating();
    last_segmentation[piece].width =
        chunks_width(chunks_record->chunk_widths, first_chunk, last_chunk);
    last_segmentation[piece].gap =
        chunks_gap(chunks_record->chunk_widths, last_chunk);

    *result += piece_choices;
    first_chunk = last_chunk + 1;
  }
  return result;
}
/**********************************************************************
 * record_piece_ratings
 *
 * Build a ratings matrix for the given blobs.  Entry (first, last),
 * with first <= last, describes the piece made of blobs first..last;
 * it is filled with the choice list that blob_match_table has stored
 * for the bounding box of that piece, when such a list exists.
 * If merge_fragments_in_matrix is set, merge_fragments() is then run
 * over the matrix.  The caller receives a newly allocated MATRIX.
 **********************************************************************/
MATRIX *Wordrec::record_piece_ratings(TBLOB *blobs) {
  const inT16 blob_count = count_blobs(blobs);
  TBOX *blob_boxes = record_blob_bounds(blobs);
  MATRIX *piece_ratings = new MATRIX(blob_count);

  for (int first = 0; first < blob_count; ++first) {
    for (int last = first; last < blob_count; ++last) {
      TBOX piece_box = bounds_of_piece(blob_boxes, first, last);
      BLOB_CHOICE_LIST *cached = blob_match_table.get_match_by_box(piece_box);
      if (cached == NULL)
        continue;  // This piece has not been classified.
      piece_ratings->put(first, last, cached);
    }
  }

  if (merge_fragments_in_matrix)
    merge_fragments(piece_ratings, blob_count);

  // The per-blob boxes were only needed to build the piece boxes.
  delete [] blob_boxes;
  return piece_ratings;
}
/**
 * rebuild_current_state
 *
 * Transfers the given state to the word's output fields and returns the
 * corresponding blob choices, one list per output character.
 *
 * In this function word->rebuild_word is replaced by a fresh TWERD and
 * word->best_state is cleared and refilled with the number of blobs that
 * make up each output character.  Characters are processed from the
 * last one to the first; character fragments recorded in
 * word->best_choice are merged into a single choice and inserted into
 * the re-classified list for the joined blob.
 *
 * @param word        Word being rebuilt (seam_array, chopped_word and
 *                    best_choice are read; rebuild_word and best_state
 *                    are written).
 * @param state       Segmentation state to rebuild.
 * @param old_choices Choices for the chunks of the state.  Ownership is
 *                    taken: the data pointers and the vector itself are
 *                    deleted before returning.
 * @param ratings     Ratings matrix passed on to join_blobs_and_classify.
 * @return Newly allocated vector of choice lists.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    WERD_RES *word,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  // (delete on a null pointer is a no-op, so no test is needed.)
  delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  // After this loop x is the first blob of the last chunk.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;
  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    // No fragment information: one output character per chunk.
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
      expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;

    if (fragment_pieces > 1) {
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      // The loop only locates the insertion point, so its body is empty;
      // the single insertion happens once, after the loop.  Inserting
      // inside the loop would link the same element repeatedly and would
      // skip it altogether when it has the lowest rating.
      for (choice_it.mark_cycle_pt();
           !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward()) {
      }
      choice_it.add_before_stay_put(merged_choice);
    }
    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr so they describe the previous
    // character.  Nothing precedes character 0, and stepping back there
    // would read one byte before the start of the lengths string.
    if (char_choices_index > 0 &&
        word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return char_choices;
}
/**
 * rebuild_current_state
 *
 * Rebuild the blob choices for the segmentation represented by the
 * given state, merging character fragments recorded in best_choice.
 *
 * If force_rebuild is false and best_choice contains no fragments, no
 * work is needed and old_choices is returned unchanged (the caller still
 * owns it).  Otherwise every chunk of the state is joined and
 * classified from last to first, fragments of one character are
 * combined into a single merged choice, old_choices is deleted and a
 * newly allocated vector is returned.
 *
 * @param blobs         Blobs of the word.
 * @param seam_list     Seams of the word; their count gives the number
 *                      of joints in the state.
 * @param state         Segmentation state to rebuild.
 * @param old_choices   Choices for the chunks of the state (deleted
 *                      unless returned, see above).
 * @param fx            Passed through to join_blobs_and_classify.
 * @param force_rebuild Rebuild even if there are no fragments to merge.
 * @param best_choice   Supplies fragment lengths and unichar strings.
 * @param ratings       Passed through to join_blobs_and_classify.
 * @return old_choices on the early exit, otherwise the new vector.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuiling state", state, num_joints);
  }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // After this loop x is the first blob of the last chunk.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs (blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;
  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < best_choice.length(); ++i) {
      *char_choices += NULL;
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    // No fragment information: one output character per chunk.
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments to
  // merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = best_choice.unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
      expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (int i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices =
        join_blobs_and_classify(blobs, seam_list, x, y, fx, ratings,
                                old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
            getDict().getUnicharset().get_fragment(
                temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          // Ratings add up; the certainty kept is the largest one seen.
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
            new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                            rating, certainty, 0, NO_PERM);

        // Insert merged_blob into current_choices, such that current_choices
        // are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        // Empty body: the loop only locates the insertion point.
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward()) {
        }
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Update word_ptr and word_lengths_ptr so they describe the previous
      // character.  Once the first character has been stored there is no
      // previous one, and stepping back would read one byte before the
      // start of the lengths string.
      if (char_choices_index >= 0 &&
          word_lengths_ptr != NULL && word_ptr != NULL) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    // Move left to the previous chunk.
    y = x - 1;
    x = y - search_state[i];
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return (char_choices);
}
/********************************************************************** * improve_by_chopping * * Start with the current word of blobs and its classification. Find * the worst blobs and try to divide them up to improve the ratings. * As long as ratings are produced by the new blob splitting. When * all the splitting has been accomplished all the ratings memory is * reclaimed. **********************************************************************/ void improve_by_chopping(register TWERD *word, CHOICES_LIST *char_choices, int fx, STATE *best_state, A_CHOICE *best_choice, A_CHOICE *raw_choice, SEAMS *seam_list, DANGERR *fixpt, STATE *chop_states, INT32 *state_count, STATE *correct_state, INT32 pass) { INT32 blob_number; INT32 index; //to states CHOICES_LIST choices = *char_choices; float old_best; int fixpt_valid = 1; static INT32 old_count; //from pass1 do { /* Improvement loop */ if (!fixpt_valid) fixpt->index = -1; old_best = class_probability (best_choice); choices = improve_one_blob (word, *char_choices, fx, &blob_number, seam_list, fixpt, chop_states + *state_count, correct_state, pass); if (choices != NULL) { LogNewSplit(blob_number); permute_characters (choices, class_probability (best_choice), best_choice, raw_choice); *char_choices = choices; if (old_best > class_probability (best_choice)) { set_n_ones (best_state, array_count (*char_choices) - 1); fixpt_valid = 1; } else { insert_new_chunk (best_state, blob_number, array_count (*char_choices) - 2); fixpt_valid = 0; } if (*state_count > 0) { if (pass == 0) { for (index = 0; index < *state_count; index++) insert_new_chunk (&chop_states[index], blob_number, array_count (*char_choices) - 2); set_n_ones (&chop_states[index], array_count (*char_choices) - 1); } (*state_count)++; } if (chop_debug) print_state ("best state = ", best_state, count_blobs (word->blobs) - 1); if (first_pass) chops_performed1++; else chops_performed2++; } } while (choices && !AcceptableChoice (*char_choices, best_choice, raw_choice, 
fixpt) && !blob_skip && array_count (*char_choices) < MAX_NUM_CHUNKS); if (pass == 0) old_count = *state_count; else { if (old_count != *state_count) fprintf (matcher_fp, "Mis-matched state counts, " INT32FORMAT " pass1, " INT32FORMAT " pass2\n", old_count, *state_count); } if (!fixpt_valid) fixpt->index = -1; }