/**
 * @name GeneratePainPoint
 *
 * Generates a pain point of the given type at (col, row), unless that cell
 * of the ratings matrix is already classified or the resulting blob has a
 * bad shape. Returns true if the point was pushed onto the heap for its type.
 */
bool LMPainPoints::GeneratePainPoint(
    int col, int row, LMPainPointsType pp_type, float special_priority,
    bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res) {
  MATRIX_COORD coord(col, row);
  if (coord.Valid(*word_res->ratings) &&
      word_res->ratings->Classified(col, row, dict_->WildcardID())) {
    return false;
  }
  if (debug_level_ > 3) {
    tprintf("Generating pain point for col=%d row=%d type=%s\n",
            col, row, LMPainPointsTypeName[pp_type]);
  }
  // Compute associate stats.
  AssociateStats associate_stats;
  AssociateUtils::ComputeStats(col, row, NULL, 0, fixed_pitch_,
                               max_char_wh_ratio, word_res, debug_level_,
                               &associate_stats);
  // For fixed-pitch fonts/languages: if the current combined blob overlaps
  // the next blob on the right and it is ok to extend the blob, try extending
  // the blob until there is no overlap with the next blob on the right or
  // until the width-to-height ratio becomes too large.
  if (ok_to_extend) {
    while (associate_stats.bad_fixed_pitch_right_gap &&
           row + 1 < word_res->ratings->dimension() &&
           !associate_stats.bad_fixed_pitch_wh_ratio) {
      AssociateUtils::ComputeStats(col, ++row, NULL, 0, fixed_pitch_,
                                   max_char_wh_ratio, word_res, debug_level_,
                                   &associate_stats);
    }
  }
  if (associate_stats.bad_shape) {
    if (debug_level_ > 3) {
      tprintf("Discarded pain point with a bad shape\n");
    }
    return false;
  }

  // Insert the new pain point into pain_points_heaps_[pp_type].
  if (pain_points_heaps_[pp_type].size() < max_heap_size_) {
    // Compute pain point priority.
    float priority;
    if (pp_type == LM_PPTYPE_PATH) {
      priority = special_priority;
    } else {
      priority = associate_stats.gap_sum;
    }
    MatrixCoordPair pain_point(priority, MATRIX_COORD(col, row));
    pain_points_heaps_[pp_type].Push(&pain_point);
    if (debug_level_) {
      tprintf("Added pain point with priority %g\n", priority);
    }
    return true;
  } else {
    if (debug_level_) tprintf("Pain points heap is full\n");
    return false;
  }
}
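// ---------------------------------------------------------------------------
// A minimal standalone sketch of the bounded priority-heap pattern that
// GeneratePainPoint() ends with: push a (priority, coordinate) pair unless
// the heap for that pain point type is already full. Tesseract's GenericHeap
// and MatrixCoordPair are replaced with std:: equivalents; Coord,
// kMaxHeapSize, and PushPainPoint() are hypothetical stand-ins for
// illustration, not part of the Tesseract API.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <queue>
#include <utility>
#include <vector>

namespace pain_point_sketch {

struct Coord {
  int col;
  int row;
};

using PrioritizedCoord = std::pair<float, Coord>;

// Orders the queue so the smallest priority value is popped first, i.e. a
// min-heap keyed on priority (lower values are processed sooner).
struct LowerPriorityFirst {
  bool operator()(const PrioritizedCoord& a, const PrioritizedCoord& b) const {
    return a.first > b.first;
  }
};

using PainPointHeap = std::priority_queue<
    PrioritizedCoord, std::vector<PrioritizedCoord>, LowerPriorityFirst>;

constexpr std::size_t kMaxHeapSize = 8;  // Stands in for max_heap_size_.

// Mirrors the final branch of GeneratePainPoint(): refuse the push when the
// heap is full, otherwise record the coordinate under its priority.
bool PushPainPoint(float priority, int col, int row, PainPointHeap* heap) {
  if (heap->size() >= kMaxHeapSize) return false;
  heap->push({priority, {col, row}});
  return true;
}

}  // namespace pain_point_sketch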
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs and fixing up
 * all the data, and incrementally runs the segmentation search until a good
 * word is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // Improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);
    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending, word,
                         pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If, after running only the chopper, best_choice is incorrect and no blame
  // has been set yet, blame the classifier if best_choice is the classifier's
  // top choice and is a dictionary word (i.e. the language model could not
  // have helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
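// ---------------------------------------------------------------------------
// A minimal standalone sketch of the remap-after-chop bookkeeping that
// improve_by_chopping() triggers via InsertSeam() and RemapForSplit(): once
// blob `split_index` is chopped in two, every blob index to its right shifts
// by one, and a span whose end touches the split grows to cover both halves.
// Coord and RemapForSplit() here are hypothetical stand-ins that only
// illustrate the index arithmetic, not Tesseract's actual classes.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

namespace remap_sketch {

struct Coord {
  int col;  // First blob of the span.
  int row;  // Last blob of the span.
};

// Shifts coordinates to account for one extra blob at split_index: columns
// strictly beyond the split move right, rows at or beyond it move right, so
// a span that ended on the split now covers both of the new halves.
void RemapForSplit(int split_index, std::vector<Coord>* coords) {
  for (Coord& c : *coords) {
    if (c.col > split_index) ++c.col;
    if (c.row >= split_index) ++c.row;
  }
}

}  // namespace remap_sketch

int main() {
  std::vector<remap_sketch::Coord> coords = {{0, 0}, {1, 2}, {3, 3}};
  remap_sketch::RemapForSplit(/*split_index=*/1, &coords);
  for (const remap_sketch::Coord& c : coords) {
    std::printf("(%d,%d)\n", c.col, c.row);  // Prints (0,0) (1,3) (4,4).
  }
  return 0;
}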