// Returns a bigger MATRIX with a new column and row in the matrix in order
// to split the blob at the given (ind,ind) diagonal location.
// Entries are relocated to the new MATRIX using the transformation defined
// by MATRIX_COORD::MapForSplit.
// Transfers the pointer data to the new MATRIX and deletes *this.
MATRIX* MATRIX::ConsumeAndMakeBigger(int ind) {
  int dim = dimension();
  int band_width = bandwidth();
  // Check to see if bandwidth needs expanding.
  for (int col = ind; col >= 0 && col > ind - band_width; --col) {
    if (array_[col * band_width + band_width - 1] != empty_) {
      ++band_width;
      break;
    }
  }
  MATRIX* result = new MATRIX(dim + 1, band_width);

  for (int col = 0; col < dim; ++col) {
    for (int row = col; row < dim && row < col + bandwidth(); ++row) {
      MATRIX_COORD coord(col, row);
      coord.MapForSplit(ind);
      BLOB_CHOICE_LIST* choices = get(col, row);
      if (choices != NULL) {
        // Correct matrix location on each choice.
        BLOB_CHOICE_IT bc_it(choices);
        for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
          BLOB_CHOICE* choice = bc_it.data();
          choice->set_matrix_cell(coord.col, coord.row);
        }
        ASSERT_HOST(coord.Valid(*result));
        result->put(coord.col, coord.row, choices);
      }
    }
  }
  delete this;
  return result;
}
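// Usage sketch (hypothetical helper, not part of the original source):
// because ConsumeAndMakeBigger deletes its receiver, the returned matrix
// must replace the old pointer immediately. Assumes 'ratings' is
// heap-allocated and owned by the caller, as elsewhere in this code.
static MATRIX* GrowRatingsForSplit(MATRIX* ratings, int ind) {
  // After this call the old 'ratings' pointer is dead; only the returned
  // matrix, which is one row/column bigger, may be used.
  return ratings->ConsumeAndMakeBigger(ind);
}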
/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results. Find the
 * worst blob in the word and chop it up. Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough. The results are returned in the WERD_RES.
 */
void Wordrec::chop_word_main(WERD_RES *word) {
  int num_blobs = word->chopped_word->NumBlobs();
  if (word->ratings == NULL) {
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
  }
  if (word->ratings->get(0, 0) == NULL) {
    // Run initial classification.
    for (int b = 0; b < num_blobs; ++b) {
      BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
                                                 "Initial:", word->chopped_word,
                                                 word->blamer_bundle);
      word->ratings->put(b, b, choices);
    }
  } else {
    // Blobs have been pre-classified. Set the matrix cell for all blob
    // choices.
    for (int col = 0; col < word->ratings->dimension(); ++col) {
      for (int row = col; row < word->ratings->dimension() &&
               row < col + word->ratings->bandwidth(); ++row) {
        BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
        if (choices != NULL) {
          BLOB_CHOICE_IT bc_it(choices);
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
            bc_it.data()->set_matrix_cell(col, row);
          }
        }
      }
    }
  }

  // Run the Segmentation Search.
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);

  if (word->best_choice == NULL) {
    // SegSearch found no valid paths, so just use the leading diagonal.
    word->FakeWordFromRatings();
  }
  word->RebuildBestState();
  // If we finished without a hyphen at the end of the word, let the next word
  // be found in the dictionary.
  if (word->word->flag(W_EOL) &&
      !getDict().has_hyphen_end(*word->best_choice)) {
    getDict().reset_hyphen_vars(true);
  }

  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
    CallFillLattice(*word->ratings, word->best_choices,
                    *word->uch_set, word->blamer_bundle);
  }
  if (wordrec_debug_level > 0) {
    tprintf("Final Ratings Matrix:\n");
    word->ratings->print(getDict().getUnicharset());
  }
  word->FilterWordChoices(getDict().stopper_debug_level);
}
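// Illustrative sketch (hypothetical debug helper, not in the source): the
// ratings MATRIX is band-diagonal. Cell (col, row) with row >= col holds the
// classifier results for the merged chunk span [col, row], and only spans
// shorter than bandwidth() chunks are stored, which is exactly the region
// the nested loop in chop_word_main above walks.
static void CountClassifiedCells(const MATRIX& ratings) {
  int cells = 0;
  for (int col = 0; col < ratings.dimension(); ++col) {
    for (int row = col;
         row < ratings.dimension() && row < col + ratings.bandwidth(); ++row) {
      if (ratings.get(col, row) != NOT_CLASSIFIED) ++cells;
    }
  }
  tprintf("Ratings matrix: %d classified cells\n", cells);
}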
// Joins the blobs between start and end (if needed), classifies the joined
// blob, tags each resulting BLOB_CHOICE with its (start, end) matrix cell,
// and then restores the original blobs.
BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
                                          inT16 start,
                                          inT16 end,
                                          const char* description,
                                          TWERD *word,
                                          BlamerBundle *blamer_bundle) {
  if (end > start) join_pieces(seams, start, end, word);
  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
                                            White, blamer_bundle);
  // Set the matrix_cell_ entries in all the BLOB_CHOICEs.
  BLOB_CHOICE_IT bc_it(choices);
  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
    bc_it.data()->set_matrix_cell(start, end);
  }
  if (end > start) break_pieces(seams, start, end, word);
  return choices;
}
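// Usage sketch (hypothetical; mirrors the initial-classification loop in
// chop_word_main above, with an invented "Merged:" label): classifying the
// joined span [col, row] fills one cell of the band-diagonal ratings matrix.
//   BLOB_CHOICE_LIST* choices = classify_piece(
//       word->seam_array, col, row, "Merged:", word->chopped_word,
//       word->blamer_bundle);
//   word->ratings->put(col, row, choices);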
// Helper recursively prints all paths through the ratings matrix, starting
// at column col.
static void PrintMatrixPaths(int col, int dim,
                             const MATRIX& ratings, int length,
                             const BLOB_CHOICE** blob_choices,
                             const UNICHARSET& unicharset,
                             const char *label,
                             FILE *output_file) {
  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
    if (ratings.get(col, row) != NOT_CLASSIFIED) {
      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
        blob_choices[length] = bc_it.data();
        if (row + 1 < dim) {
          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
                           unicharset, label, output_file);
        } else {
          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
        }
      }
    }
  }
}
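// Usage sketch (hypothetical driver, not in the source): enumerates every
// segmentation path through an n x n ratings matrix. A path has at most
// dimension() entries (one per chunk), so a scratch array of that size is
// enough for the recursion.
static void PrintAllPaths(const MATRIX& ratings, const UNICHARSET& unicharset,
                          const char* label, FILE* output_file) {
  int dim = ratings.dimension();
  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
  PrintMatrixPaths(0, dim, ratings, 0, blob_choices, unicharset,
                   label, output_file);
  delete [] blob_choices;
}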
// Moved from speckle.cpp
// Adds a noise classification result that is a bit worse than the worst
// current result, or the worst possible result if there are no current
// results.
void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
  BLOB_CHOICE_IT bc_it(choices);
  // If there is no classifier result, we will use the worst possible
  // certainty and corresponding rating.
  float certainty = -getDict().certainty_scale;
  float rating = rating_scale * blob_length;
  if (!choices->empty() && blob_length > 0) {
    bc_it.move_to_last();
    BLOB_CHOICE* worst_choice = bc_it.data();
    // Add speckle_rating_penalty to the worst rating, matching the old value.
    rating = worst_choice->rating() + speckle_rating_penalty;
    // Compute the certainty to correspond to the rating. (The certainty used
    // to be kept the same, but that messes up the language model search.)
    certainty = -rating * getDict().certainty_scale /
                (rating_scale * blob_length);
  }
  BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
                                             -1, -1, 0, 0, MAX_FLOAT32, 0,
                                             BCC_SPECKLE_CLASSIFIER);
  bc_it.add_to_end(blob_choice);
}
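// Worked example (illustrative numbers, not from the source): suppose
// rating_scale = 1.5, certainty_scale = 20.0, blob_length = 4, the worst
// existing rating is 6.0 and speckle_rating_penalty = 2.0. Then
//   rating    = 6.0 + 2.0 = 8.0
//   certainty = -8.0 * 20.0 / (1.5 * 4) = -26.67
// so the appended speckle choice sorts strictly after the previous worst
// choice by rating, with a certainty scaled to match that rating.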
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs and fixing up
 * all the data, and incrementally runs the segmentation search until a good
 * word is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICEs),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run the language model incrementally. (Except with the n-gram model
    // on, in which case the whole word is re-evaluated from blob 0.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word,
                         pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has yet been set, blame the classifier if best_choice is the classifier's
  // top choice and is a dictionary word (i.e. the language model could not
  // have helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
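// Illustrative note (hypothetical sanity check, not in the source): a chop
// at blob_number turns one chunk into two, so every per-chunk structure is
// expected to grow in lockstep with the ratings matrix after InsertSeam and
// the insertions above.
//   int new_dim = word->ratings->dimension();
//   ASSERT_HOST(best_choice_bundle->beam.size() == new_dim);
//   ASSERT_HOST(pending->size() == new_dim);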
bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                            DANGERR *fixpt,
                            bool fix_replaceable,
                            MATRIX *ratings) {
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n",
            best_choice->debug_string().string());
  }

  // Construct a BLOB_CHOICE_LIST_VECTOR with ambiguities
  // for each unichar id in BestChoice.
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  int i;
  bool ambigs_found = false;
  // For each position in best_choice:
  // -- choose the AMBIG_SPEC_LIST that corresponds to the unichar_id at
  //    best_choice[i]
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
  // -- look for ambiguities corresponding to wrong_ngram in the list while
  //    adding the following unichar_ids from best_choice to wrong_ngram
  //
  // Repeat the above procedure twice: the first time look through the
  // ambigs to be replaced and replace all the ambiguities found; the
  // second time look through the dangerous ambiguities and construct
  // ambig_blob_choices with a fake blob choice for each ambiguity, then
  // pass them to dawg_permute_and_select() to search for ambiguous words
  // in the dictionaries.
  //
  // Note that during the execution of the for loop (on the first pass),
  // if replacements are made, the length of best_choice might change.
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
    bool replace = (fix_replaceable && pass == 0);
    const UnicharAmbigsVector &table = replace ?
        getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
    if (!replace) {
      // Initialize ambig_blob_choices with lists containing a single
      // unichar id for the corresponding position in best_choice.
      // A best_choice consisting of only the original letters will
      // have a rating of 0.0.
      for (i = 0; i < best_choice->length(); ++i) {
        BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        // TODO(rays/antonova) Put real xheights and y shifts here.
        lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
                                          0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
        ambig_blob_choices.push_back(lst);
      }
    }
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    int wrong_ngram_index;
    int next_index;
    int blob_index = 0;
    for (i = 0; i < best_choice->length();
         blob_index += best_choice->state(i), ++i) {
      UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
      if (stopper_debug_level > 2) {
        tprintf("Looking for %s ngrams starting with %s:\n",
                replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).string());
      }
      int num_wrong_blobs = best_choice->state(i);
      wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
      if (curr_unichar_id == INVALID_UNICHAR_ID ||
          curr_unichar_id >= table.size() ||
          table[curr_unichar_id] == nullptr) {
        continue;  // there is no ambig spec for this unichar id
      }
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
        int compare = UnicharIdArrayUtils::compare(wrong_ngram,
                                                   ambig_spec->wrong_ngram);
        if (stopper_debug_level > 2) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }
        if (compare == 0) {
          // Record the place where we found an ambiguity.
          if (fixpt != nullptr) {
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
            fixpt->push_back(DANGERR_INFO(
                blob_index, blob_index + num_wrong_blobs, replace,
                getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
                      blob_index + num_wrong_blobs, false,
                      getUnicharset().get_isngram(
                          ambig_spec->correct_ngram_id),
                      getUnicharset().id_to_unichar(leftmost_id));
            }
          }

          if (replace) {
            if (stopper_debug_level > 2) {
              tprintf("replace ambiguity with %s : ",
                      getUnicharset().id_to_unichar(
                          ambig_spec->correct_ngram_id));
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
                         ambig_spec->correct_ngram_id,
                         best_choice, ratings);
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // We found a dangerous ambig: update ambig_blob_choices.
            if (stopper_debug_level > 2) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(
                  ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
                 ++tmp_index) {
              // Add a blob choice for the corresponding fragment of the
              // ambiguity. These fake blob choices are initialized with
              // negative ratings (which are not possible for real blob
              // choices), so that dawg_permute_and_select() considers any
              // word not consisting of only the original letters a better
              // choice and stops searching for alternatives once such a
              // choice is found.
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(
                  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
                  -1, 0, 1, 0, BCC_AMBIG));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
              ((next_index = wrong_ngram_index + 1 + i) <
               best_choice->length())) {
            // Add the next unichar id to wrong_ngram and keep looking for
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
            wrong_ngram[++wrong_ngram_index] =
                best_choice->unichar_id(next_index);
            num_wrong_blobs += best_choice->state(next_index);
          } else {
            break;  // no more matching ambigs in this AMBIG_SPEC_LIST
          }
        } else {
          spec_it.forward();
        }
      }  // end searching AmbigSpec_LIST
    }  // end searching best_choice
  }  // end searching replace and dangerous ambigs

  // If any ambiguities were found, permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
    if (stopper_debug_level > 2) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (i = 0; i < ambig_blob_choices.length(); ++i) {
        print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf("Stopper: Possible ambiguous word = %s\n",
                alt_word->debug_string().string());
      }
      if (fixpt != nullptr) {
        // Note: Currently character choices combined from fragments can only
        // be generated by NoDangerousAmbig(). This code should be updated if
        // the capability to produce classifications combined from character
        // fragments is added to other functions.
        int orig_i = 0;
        for (i = 0; i < alt_word->length(); ++i) {
          const UNICHARSET &uchset = getUnicharset();
          bool replacement_is_ngram =
              uchset.get_isngram(alt_word->unichar_id(i));
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
          if (replacement_is_ngram) {
            // We have to extract the leftmost unichar from the ngram.
            const char *str = uchset.id_to_unichar(leftmost_id);
            int step = uchset.step(str);
            if (step) leftmost_id = uchset.unichar_to_id(str, step);
          }
          int end_i = orig_i + alt_word->state(i);
          if (alt_word->state(i) > 1 ||
              (orig_i + 1 == end_i && replacement_is_ngram)) {
            // Compute proper blob indices.
            int blob_start = 0;
            for (int j = 0; j < orig_i; ++j)
              blob_start += best_choice->state(j);
            int blob_end = blob_start;
            for (int j = orig_i; j < end_i; ++j)
              blob_end += best_choice->state(j);
            fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
                                          replacement_is_ngram, leftmost_id));
            if (stopper_debug_level > 1) {
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
                      true, replacement_is_ngram,
                      uchset.id_to_unichar(leftmost_id));
            }
          }
          orig_i += alt_word->state(i);
        }
      }
    }
    delete alt_word;
  }
  if (output_ambig_words_file_ != nullptr) {
    fprintf(output_ambig_words_file_, "\n");
  }
  ambig_blob_choices.delete_data_pointers();
  return !ambigs_found;
}
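// Usage sketch (hypothetical caller, assuming the stopper-style flow in
// which this test participates): a false return means best_choice could be
// misread as another dictionary word, with the details recorded in fixpt.
//   DANGERR fixpt;
//   if (!getDict().NoDangerousAmbig(word->best_choice, &fixpt,
//                                   true, word->ratings)) {
//     // Reject or re-examine the word; fixpt lists the ambiguous blob
//     // ranges and their suggested replacements.
//   }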