void Wordrec::ProcessSegSearchPainPoint(
    float pain_point_priority,
    const MATRIX_COORD &pain_point, const char* pain_point_type,
    GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
    LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
            pain_point_type, pain_point_priority,
            pain_point.col, pain_point.row);
  }
  ASSERT_HOST(pain_points != NULL);
  MATRIX *ratings = word_res->ratings;
  // Classify blob [pain_point.col pain_point.row]
  if (!pain_point.Valid(*ratings)) {
    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
  }
  ASSERT_HOST(pain_point.Valid(*ratings));
  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
                                                pain_point.col, pain_point.row,
                                                pain_point_type,
                                                word_res->chopped_word,
                                                blamer_bundle);
  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
  if (lst == NULL) {
    ratings->put(pain_point.col, pain_point.row, classified);
  } else {
    // We can not delete old BLOB_CHOICEs, since they might contain
    // ViterbiStateEntries that are parents of other "active" entries.
    // Thus if the matrix cell already contains classifications we add
    // the new ones to the beginning of the list.
    BLOB_CHOICE_IT it(lst);
    it.add_list_before(classified);
    delete classified;  // safe to delete, since empty after add_list_before()
    classified = NULL;
  }
  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (classified != NULL && !classified->empty()) {
    if (pain_point.col > 0) {
      pain_points->GeneratePainPoint(
          pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
    if (pain_point.row + 1 < ratings->dimension()) {
      pain_points->GeneratePainPoint(
          pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
  }
  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
}
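// --- Illustrative sketch (not Tesseract code) ---
// ProcessSegSearchPainPoint() above grows the ratings band before classifying
// a pain point that spans more chunks than the current band allows. The
// stand-in below, with hypothetical names, shows the band arithmetic the
// Valid()/IncreaseBandSize() pair is assumed to implement: a cell (col, row)
// with col <= row is addressable only while row - col stays below the band
// width, so classifying the span col..row needs a band of at least
// row + 1 - col (exactly the value passed to IncreaseBandSize() above).
#include <algorithm>

struct BandedMatrixSketch {
  int dimension;   // number of chunks (the matrix is dimension x dimension)
  int bandwidth;   // how many rows above the diagonal are representable

  // Hypothetical analogue of MATRIX_COORD::Valid().
  bool Valid(int col, int row) const {
    return 0 <= col && col <= row && row < dimension &&
           row - col < bandwidth;
  }
  // Hypothetical analogue of MATRIX::IncreaseBandSize(): make sure spans of
  // new_bandwidth chunks fit, without exceeding the full matrix.
  void IncreaseBandSize(int new_bandwidth) {
    bandwidth = std::min(std::max(bandwidth, new_bandwidth), dimension);
  }
};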
void LMPainPoints::GenerateInitial(WERD_RES *word_res) {
  MATRIX *ratings = word_res->ratings;
  AssociateStats associate_stats;
  for (int col = 0; col < ratings->dimension(); ++col) {
    int row_end = MIN(ratings->dimension(), col + ratings->bandwidth() + 1);
    for (int row = col + 1; row < row_end; ++row) {
      MATRIX_COORD coord(col, row);
      if (coord.Valid(*ratings) &&
          ratings->get(col, row) != NOT_CLASSIFIED) continue;
      // Add an initial pain point if needed.
      if (ratings->Classified(col, row - 1, dict_->WildcardID()) ||
          (col + 1 < ratings->dimension() &&
           ratings->Classified(col + 1, row, dict_->WildcardID()))) {
        GeneratePainPoint(col, row, LM_PPTYPE_SHAPE, 0.0, true,
                          max_char_wh_ratio_, word_res);
      }
    }
  }
}
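// --- Illustrative sketch (not Tesseract code) ---
// GenerateInitial() above visits, for each column, the rows just above the
// single-chunk diagonal entry, and seeds a pain point wherever an
// unclassified cell has a classified neighbor at (col, row - 1) or
// (col + 1, row). The iteration pattern, with hypothetical names (note that
// row_end reaches one row beyond the nominal band; when such a point is
// popped, ProcessSegSearchPainPoint() above grows the band accordingly):
#include <algorithm>
#include <functional>

void WalkInitialBand(int dimension, int bandwidth,
                     const std::function<void(int col, int row)> &visit) {
  for (int col = 0; col < dimension; ++col) {
    int row_end = std::min(dimension, col + bandwidth + 1);
    for (int row = col + 1; row < row_end; ++row) {
      visit(col, row);  // candidate cell spanning chunks col..row
    }
  }
}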
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;

  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components; adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
      new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pairs of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in non-decreasing order of their column, since
  // this guarantees that all the parents are up to date before a child is
  // updated.
  SEG_SEARCH_PENDING_LIST *pending =
      new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col - 1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col - 1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row + 1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row + 1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row + 1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up.
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
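// --- Illustrative sketch (not Tesseract code) ---
// The inner loop of SegSearch() above pops queued pain points until it finds
// one whose matrix cell is still unclassified; stale entries (cells that were
// classified after being queued) are simply discarded. A generic version of
// that pattern, with std::priority_queue standing in for the HEAP type and a
// caller-supplied staleness test; the ordering here is illustrative only.
#include <functional>
#include <queue>
#include <vector>

struct PainPointSketch {
  float priority;
  int col, row;
};

struct ByPriority {
  // The real heap's notion of the "best" priority is defined by the language
  // model; this comparator is just a placeholder.
  bool operator()(const PainPointSketch &a, const PainPointSketch &b) const {
    return a.priority > b.priority;
  }
};

bool PopNextUnclassified(
    std::priority_queue<PainPointSketch, std::vector<PainPointSketch>,
                        ByPriority> *queue,
    const std::function<bool(int col, int row)> &is_unclassified,
    PainPointSketch *out) {
  while (!queue->empty()) {
    PainPointSketch candidate = queue->top();
    queue->pop();
    if (is_unclassified(candidate.col, candidate.row)) {
      *out = candidate;
      return true;  // found a pain point that still needs classification
    }
    // Otherwise the cell was classified after this entry was queued; drop it.
  }
  return false;  // queue exhausted; mirrors the pop == EMPTY case above
}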
void Wordrec::UpdateSegSearchNodes(int starting_col,
                                   SEG_SEARCH_PENDING_LIST *pending[],
                                   BestPathByColumn *best_path_by_column[],
                                   CHUNKS_RECORD *chunks_record,
                                   HEAP *pain_points,
                                   BestChoiceBundle *best_choice_bundle) {
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length() - 1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
          language_model_->UpdateState(p->changed, col, p->child_row,
                                       current_node, p->parent, pain_points,
                                       best_path_by_column, chunks_record,
                                       best_choice_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add all the
        // pairs with it as a parent and each of its children to pending, so
        // that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
                new SEG_SEARCH_PENDING(child_row, current_node, 0);
            SEG_SEARCH_PENDING *actual_new_pending =
                reinterpret_cast<SEG_SEARCH_PENDING *>(
                    (*pending)[child_col].add_sorted_and_find(
                        SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
              tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                      " changed=0x%x to pending\n",
                      child_col, actual_new_pending->child_row,
                      col, p->child_row, actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    }  // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
        col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
        pain_points, best_path_by_column, chunks_record);
  }  // end for col
  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }
  language_model_->CleanUp();
}
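// --- Illustrative sketch (not Tesseract code) ---
// The propagation step above relies on the ratings-matrix geometry: a cell
// (col, row) covers chunks col..row, and every segment that can follow it in
// a word path lives in column row + 1. Enqueuing a changed node's children
// therefore means walking a single column, as in this hypothetical helper:
#include <functional>

void ForEachChild(int parent_row, int dimension,
                  const std::function<void(int col, int row)> &enqueue) {
  int child_col = parent_row + 1;  // children start right after the parent
  for (int child_row = child_col; child_row < dimension; ++child_row) {
    // (child_col, child_row) spans the chunks immediately following the parent.
    enqueue(child_col, child_row);
  }
}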
void Wordrec::UpdateSegSearchNodes(
    float rating_cert_scale,
    int starting_col,
    GenericVector<SegSearchPending>* pending,
    WERD_RES *word_res,
    LMPainPoints *pain_points,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  MATRIX *ratings = word_res->ratings;
  ASSERT_HOST(ratings->dimension() == pending->size());
  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (!(*pending)[col].WorkToDo()) continue;
    int first_row = col;
    int last_row = MIN(ratings->dimension() - 1,
                       col + ratings->bandwidth() - 1);
    if ((*pending)[col].SingleRow() >= 0) {
      first_row = last_row = (*pending)[col].SingleRow();
    }
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
              col, first_row, last_row,
              (*pending)[col].IsRowJustClassified(INT32_MAX));
    }
    // Iterate over the pending rows for this column.
    for (int row = first_row; row <= last_row; ++row) {
      // Update the language model state of this child+parent pair.
      BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
      LanguageModelState *parent_node =
          col == 0 ? NULL : best_choice_bundle->beam[col - 1];
      if (current_node != NULL &&
          language_model_->UpdateState(
              (*pending)[col].IsRowJustClassified(row),
              col, row, current_node, parent_node, pain_points,
              word_res, best_choice_bundle, blamer_bundle) &&
          row + 1 < ratings->dimension()) {
        // Since the language model state of this entry changed, revisit all
        // the entries in the child column.
        (*pending)[row + 1].RevisitWholeColumn();
        if (segsearch_debug_level > 0) {
          tprintf("Added child col=%d to pending\n", row + 1);
        }
      }  // end if UpdateState.
    }  // end for row.
  }  // end for col.
  if (best_choice_bundle->best_vse != NULL) {
    ASSERT_HOST(word_res->StatesAllValid());
    if (best_choice_bundle->best_vse->updated) {
      pain_points->GenerateFromPath(rating_cert_scale,
                                    best_choice_bundle->best_vse, word_res);
      if (!best_choice_bundle->fixpt.empty()) {
        pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
                                        best_choice_bundle->best_vse,
                                        word_res);
      }
    }
  }
  // The segsearch is completed. Reset all updated flags on all VSEs and
  // reset all pendings.
  for (int col = 0; col < pending->size(); ++col) {
    (*pending)[col].Clear();
    ViterbiStateEntry_IT vse_it(
        &best_choice_bundle->beam[col]->viterbi_state_entries);
    for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
      vse_it.data()->updated = false;
    }
  }
}
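// --- Illustrative sketch (not Tesseract code) ---
// The newer UpdateSegSearchNodes() and ProcessSegSearchPainPoint() drive their
// work off per-column SegSearchPending bookkeeping. The hypothetical minimal
// equivalent below shows only the interface they are seen to rely on above
// (WorkToDo, SingleRow, IsRowJustClassified, RevisitWholeColumn,
// SetBlobClassified, Clear); the real class likely tracks additional state,
// e.g. a whole-column "just classified" flag, which would explain the
// IsRowJustClassified(INT32_MAX) debug call above.
class PendingColumnSketch {
 public:
  PendingColumnSketch() : classified_row_(-1), revisit_whole_column_(false) {}

  // A single cell (this column, row) was just classified.
  void SetBlobClassified(int row) { classified_row_ = row; }
  // The column must be re-examined, but nothing in it is newly classified.
  void RevisitWholeColumn() { revisit_whole_column_ = true; }
  void Clear() {
    classified_row_ = -1;
    revisit_whole_column_ = false;
  }

  bool WorkToDo() const {
    return revisit_whole_column_ || classified_row_ >= 0;
  }
  // True if the given row was just classified.
  bool IsRowJustClassified(int row) const { return row == classified_row_; }
  // The single row to visit, or -1 when the whole column needs a pass.
  int SingleRow() const {
    return revisit_whole_column_ ? -1 : classified_row_;
  }

 private:
  int classified_row_;
  bool revisit_whole_column_;
};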