/** * @name improve_by_chopping * * Start with the current word of blobs and its classification. Find * the worst blobs and try to divide them up to improve the ratings. * As long as ratings are produced by the new blob splitting. When * all the splitting has been accomplished all the ratings memory is * reclaimed. */ void Wordrec::improve_by_chopping(WERD_RES *word, BLOB_CHOICE_LIST_VECTOR *char_choices, STATE *best_state, BLOB_CHOICE_LIST_VECTOR *best_char_choices, DANGERR *fixpt, bool *best_choice_acceptable) { inT32 blob_number; float old_best; int fixpt_valid = 1; bool updated_best_choice = false; while (1) { // improvement loop if (!fixpt_valid) fixpt->clear(); old_best = word->best_choice->rating(); if (improve_one_blob(word->chopped_word, char_choices, &blob_number, &word->seam_array, fixpt, (fragments_guide_chopper && word->best_choice->fragment_mark()))) { getDict().LogNewSplit(blob_number); updated_best_choice = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (old_best > word->best_choice->rating()) { set_n_ones(best_state, char_choices->length() - 1); fixpt_valid = 1; } else { insert_new_chunk(best_state, blob_number, char_choices->length() - 2); fixpt_valid = 0; } if (chop_debug) print_state("best state = ", best_state, count_blobs(word->chopped_word->blobs) - 1); } else { break; } // Check if we should break from the loop. bool done = false; bool replaced = false; if ((updated_best_choice && (*best_choice_acceptable = getDict().AcceptableChoice(char_choices, word->best_choice, fixpt, CHOPPER_CALLER, &replaced))) || char_choices->length() >= MAX_NUM_CHUNKS) { done = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices); if (done) break; } if (!fixpt_valid) fixpt->clear(); }
/** * @name evaluate_state * * Evaluate the segmentation that is represented by this state in the * best first search. Add this state to the "states_seen" list. */ inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record, SEARCH_RECORD *the_search, DANGERR *fixpt, BlamerBundle *blamer_bundle) { BLOB_CHOICE_LIST_VECTOR *char_choices; SEARCH_STATE chunk_groups; float rating_limit = the_search->best_choice->rating(); bool keep_going = true; PIECES_STATE widths; the_search->num_states++; chunk_groups = bin_to_chunks(the_search->this_state, the_search->num_joints); bin_to_pieces (the_search->this_state, the_search->num_joints, widths); if (wordrec_debug_level > 1) { log_state("Evaluating state", the_search->num_joints, the_search->this_state); } getDict().LogNewSegmentation(widths); char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle); getDict().SetWordsegRatingAdjustFactor(-1.0f); bool updated_best_choice = false; if (char_choices != NULL && char_choices->length() > 0) { // Compute the segmentation cost and include the cost in word rating. // TODO(dsl): We should change the SEARCH_RECORD to store this cost // from state evaluation and avoid recomputing it here. prioritize_state(chunks_record, the_search); getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias); updated_best_choice = getDict().permute_characters(*char_choices, the_search->best_choice, the_search->raw_choice); bool replaced = false; if (updated_best_choice) { if (getDict().AcceptableChoice(char_choices, the_search->best_choice, NULL, ASSOCIATOR_CALLER, &replaced)) { keep_going = false; } CopyCharChoices(*char_choices, the_search->best_char_choices); } } getDict().SetWordsegRatingAdjustFactor(-1.0f); #ifndef GRAPHICS_DISABLED if (wordrec_display_segmentations) { display_segmentation (chunks_record->chunks, chunk_groups); if (wordrec_display_segmentations > 1) window_wait(segm_window); } #endif if (rating_limit != the_search->best_choice->rating()) { ASSERT_HOST(updated_best_choice); the_search->before_best = the_search->num_states; the_search->best_state->part1 = the_search->this_state->part1; the_search->best_state->part2 = the_search->this_state->part2; replace_char_widths(chunks_record, chunk_groups); } else { ASSERT_HOST(!updated_best_choice); if (char_choices != NULL) fixpt->clear(); } if (char_choices != NULL) delete char_choices; memfree(chunk_groups); return (keep_going); }
/** * @name chop_word_main * * Classify the blobs in this word and permute the results. Find the * worst blob in the word and chop it up. Continue this process until * a good answer has been found or all the blobs have been chopped up * enough. Return the word level ratings. */ BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) { TBLOB *blob; int index; int did_chopping; STATE state; BLOB_CHOICE_LIST *match_result; MATRIX *ratings = NULL; DANGERR fixpt; /*dangerous ambig */ inT32 bit_count; //no of bits set_denorm(&word->denorm); BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR(); did_chopping = 0; for (blob = word->chopped_word->blobs, index = 0; blob != NULL; blob = blob->next, index++) { match_result = classify_blob(blob, "chop_word:", Green); if (match_result == NULL) cprintf("Null classifier output!\n"); *char_choices += match_result; } bit_count = index - 1; set_n_ones(&state, char_choices->length() - 1); bool acceptable = false; bool replaced = false; bool best_choice_updated = getDict().permute_characters(*char_choices, word->best_choice, word->raw_choice); if (best_choice_updated && getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt, CHOPPER_CALLER, &replaced)) { acceptable = true; } if (replaced) update_blob_classifications(word->chopped_word, *char_choices); CopyCharChoices(*char_choices, best_char_choices); if (!acceptable) { // do more work to find a better choice did_chopping = 1; bool best_choice_acceptable = false; if (chop_enable) improve_by_chopping(word, char_choices, &state, best_char_choices, &fixpt, &best_choice_acceptable); if (chop_debug) print_seams ("Final seam list:", word->seam_array); // The force_word_assoc is almost redundant to enable_assoc. However, // it is not conditioned on the dict behavior. For CJK, we need to force // the associator to be invoked. When we figure out the exact behavior // of dict on CJK, we can remove the flag if it turns out to be redundant. if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) { ratings = word_associator(word, &state, best_char_choices, &fixpt, &state); } } best_char_choices = rebuild_current_state(word, &state, best_char_choices, ratings); if (ratings != NULL) { if (wordrec_debug_level > 0) { tprintf("Final Ratings Matrix:\n"); ratings->print(getDict().getUnicharset()); } ratings->delete_matrix_pointers(); delete ratings; } getDict().FilterWordChoices(); char_choices->delete_data_pointers(); delete char_choices; return best_char_choices; }