// Setup and run just the initial segsearch on an established matrix,
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
                               GenericVector<SegSearchPending>* pending,
                               BestChoiceBundle* best_choice_bundle,
                               BlamerBundle* blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix%s:\n",
            wordrec_enable_assoc ? " (with assoc)" : "");
    word_res->ratings->print(getDict().getUnicharset());
  }

  pain_points->GenerateInitial(word_res);

  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               segsearch_max_char_wh_ratio, rating_cert_scale);

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL) {
    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
                                            wordrec_debug_blamer);
  }

  // pending[col] tells whether there is update work to do to combine
  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
  // As the language model state is updated, pending entries are modified to
  // minimize duplication of work. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantees that all the parents would be up to date before an update
  // of a child is done.
  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());

  // Search the ratings matrix for the initial best path.
  (*pending)[0].SetColumnClassified();
  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
                       pain_points, best_choice_bundle, blamer_bundle);
}
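// A minimal sketch of how rating_cert_scale can be used, assuming the
// language model recovers outline length as a simple ratio of rating to
// certainty (the helper below is hypothetical; the actual conversion lives
// in the language model code, not in this file). Since certainty is negative
// and rating grows with outline length, the negative scale yields a positive
// length estimate.
static float ApproxOutlineLength(float rating_cert_scale,
                                 const BLOB_CHOICE& b) {
  // E.g. with certainty_scale = 20 and rating_scale = 1.5,
  // rating_cert_scale = -13.3; a blob rated 2.0 at certainty -4.0 maps to
  // roughly -13.3 * 2.0 / -4.0 = 6.7 units of outline length.
  return rating_cert_scale * b.rating() / b.certainty();
}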
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs and fixing up
 * all the data, and incrementally runs the segmentation search until a good
 * word is found, or until no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending,
                              word, pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending,
                              word, pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run the language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word,
                         pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been set yet, blame the classifier if best_choice is the classifier's
  // top choice and is a dictionary word (i.e. the language model could not
  // have helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
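// A minimal sketch of the "pick which one to chop" idea referred to above,
// under the assumption that the worst blob is simply the one whose top
// choice has the lowest certainty (this helper is hypothetical; the actual
// selection happens inside improve_one_blob and may weigh other factors).
static int PickWorstBlob(const GenericVector<BLOB_CHOICE*>& blob_choices) {
  int worst = -1;
  float worst_cert = 0.0f;  // certainties are <= 0
  for (int i = 0; i < blob_choices.size(); ++i) {
    if (blob_choices[i] == NULL) return i;  // unclassified blob: chop it first
    if (worst < 0 || blob_choices[i]->certainty() < worst_cert) {
      worst = i;
      worst_cert = blob_choices[i]->certainty();
    }
  }
  return worst;
}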
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear the best choice accumulator (used for adaption), so that choices
  // adjusted by the chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;

  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components; adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
      new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pairs of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column,
  // since this guarantees that all the parents are up to date before a child
  // is updated.
  SEG_SEARCH_PENDING_LIST *pending =
      new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
               segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col - 1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col - 1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row + 1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row + 1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row + 1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }

    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up.
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
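// A worked example of the neighbor-join pain points generated above
// (illustrative only): classifying the piece at (col, row) proposes joining
// it with one more chunk on the left, (col - 1, row), and one more chunk on
// the right, (col, row + 1), clipped to the matrix. For a 4-chunk word,
// classifying (1, 2) proposes (0, 2) and (1, 3); classifying (0, 0) proposes
// only (0, 1). The hypothetical helper below just prints the proposals.
static void PrintNeighborJoins(int col, int row, int dimension) {
  if (col > 0)
    tprintf("Left join candidate: col=%d row=%d\n", col - 1, row);
  if (row + 1 < dimension)
    tprintf("Right join candidate: col=%d row=%d\n", col, row + 1);
}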
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  GenericVector<SegSearchPending> pending;
  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
                   blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pain_points, &pending);
    }
    if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);

    if (blamer_bundle != NULL &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }
  // Keep trying to find a better path by fixing the "pain points".
  MATRIX_COORD pain_point;
  float pain_point_priority;
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
         (!SegSearchDone(num_futile_classifications) ||
          (blamer_bundle != NULL &&
           blamer_bundle->GuidedSegsearchStillGoing()))) {
    // Get the next valid "pain point".
    bool found_nothing = true;
    LMPainPointsType pp_type;
    while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
           LM_PPTYPE_NUM) {
      if (!pain_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            pain_point.row - pain_point.col + 1);
      }
      if (pain_point.Valid(*word_res->ratings) &&
          !word_res->ratings->Classified(pain_point.col, pain_point.row,
                                         getDict().WildcardID())) {
        found_nothing = false;
        break;
      }
    }
    if (found_nothing) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, pain_point,
                              LMPainPoints::PainPointDescription(pp_type),
                              &pending, word_res, &pain_points, blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
                         word_res, &pain_points, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle->updated = false;  // reset updated

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
                             &blamer_debug);
    }
  }  // end while loop exploring alternative paths

  if (blamer_bundle != NULL) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_debug);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}
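// SegSearchDone() is called above but not defined in this section. A minimal
// sketch, assuming it simply factors out the stop condition that the legacy
// SegSearch above spells out inline (an acceptable choice has been found, or
// too many classifications in a row failed to improve the best choice); the
// bool return type and the exact member name used here are taken from the
// call sites, not from a definition in this section.
bool Wordrec::SegSearchDone(int num_futile_classifications) {
  return (language_model_->AcceptableChoiceFound() ||
          num_futile_classifications >=
              segsearch_max_futile_classifications);
}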