// Sets flags necessary for recognition in the training mode.
// Opens and returns the pointer to the output file.
FILE *Tesseract::init_recog_training(const STRING &fname) {
  if (tessedit_ambigs_training) {
    tessedit_tess_adaption_mode.set_value(0);  // turn off adaption
    tessedit_enable_doc_dict.set_value(0);     // turn off document dictionary
    // Explore all segmentations.
    getDict().stopper_no_acceptable_choices.set_value(1);
  }

  STRING output_fname = fname;
  const char *lastdot = strrchr(output_fname.string(), '.');
  if (lastdot != NULL)
    output_fname[lastdot - output_fname.string()] = '\0';
  output_fname += ".txt";
  FILE *output_file = open_file(output_fname.string(), "a+");
  return output_file;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char** argv) {
  FILE* fp;  // dictionary file
  char* programName = argv[0];
  if (argc < 2) {
    fprintf(stderr, "usage: %s dictionary\n", programName);
    return 1;
  }
  char* dictName = argv[1];
  char* input = (char*)malloc(sizeof(char) * MAXLINE);
  char* oldInput = (char*)malloc(sizeof(char) * MAXLINE);
  if ((input == NULL) || (oldInput == NULL)) {
    fprintf(stderr, MLCFAIL);
    return 1;  // cannot continue without the buffers
  }
  if ((fp = fopen(dictName, "r")) == NULL) {
    // Exit if the dictionary cannot be loaded properly.
    fprintf(stderr, "%s: %s: No such file or directory\n",
            programName, dictName);
    return 1;
  }

  // Build the trie from the dictionary.
  struct node* root = newNode();
  root = getDict(fp, dictName, root);
  fclose(fp);

  // T9 interaction:
  printf("Enter \"exit\" to quit.\n");
  while (1) {
    printf("Enter Key Sequence (or \"#\" for next word) :\n");
    scanf("%s", input);
    if (strcmp(input, "exit") == 0)  // exact match, not just a shared prefix
      break;
    if (!strncmp(input, "#", 1))
      strcat(oldInput, input);
    else
      strcpy(oldInput, input);
    char* word = searchTrie(root, oldInput, strlen(oldInput), 0);
    printf("\t%s\n", word);
    free(word);  // free allocation from appendquotes
  }

  // Clean up.
  free(input);
  free(oldInput);
  deleteTrie(root);
  return 0;
}
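/*
 * The driver above relies on a trie API (newNode, getDict, searchTrie,
 * deleteTrie) that is defined elsewhere in the project. The following is a
 * minimal sketch of what such a header could declare; the node layout
 * (eight children for keys '2'..'9' and a word list per node) is an
 * assumption for illustration, not the project's actual definition.
 */
#include <stdio.h>
#include <stddef.h>

#define T9_KEYS 8  /* keys '2'..'9' each map to one child slot */

struct node {
    struct node* children[T9_KEYS];  /* child per key digit */
    char** words;                    /* words whose key sequence ends here */
    int numWords;                    /* number of entries in words */
};

struct node* newNode(void);                   /* allocate an empty node */
struct node* getDict(FILE* fp, char* dictName,
                     struct node* root);      /* insert each dictionary word */
char* searchTrie(struct node* root, char* keys,
                 size_t len, int skip);       /* heap-allocated best match */
void deleteTrie(struct node* root);           /* free the whole trie */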
/**
 * @name program_editup
 *
 * Initialize all the things in the program that need to be initialized.
 * init_permute determines whether to initialize the permute functions
 * and Dawg models.
 */
void Wordrec::program_editup(const char *textbase, bool init_permute) {
  if (textbase != NULL) {
    imagefile = textbase;
    /* Read in data files */
    edit_with_ocr(textbase);
  }

  /* Initialize subsystems */
  program_init();
  mfeature_init();  // assumes that imagefile is initialized
  if (init_permute)
    getDict().init_permute();
  setup_cp_maps();
  init_metrics();
  pass2_ok_split = chop_ok_split;
  pass2_seg_states = wordrec_num_seg_states;
}
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the
// best choice.
void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
                                           ROW_RES *row_res,
                                           BLOCK_RES *block_res,
                                           const char *label,
                                           FILE *output_file) {
  int offset;
  // Classify word.
  fflush(stdout);
  classify_word_pass1(block_res->block, row_res->row, werd_res);
  WERD_CHOICE *best_choice = werd_res->best_choice;
  ASSERT_HOST(best_choice != NULL);
  ASSERT_HOST(best_choice->blob_choices() != NULL);

  // Compute the number of unichars in the label.
  int label_num_unichars = 0;
  int step = 1;  // should be non-zero on the first iteration
  for (offset = 0; label[offset] != '\0' && step > 0;
       step = werd_res->uch_set->step(label + offset),
       offset += step, ++label_num_unichars);
  if (step == 0) {
    tprintf("Not outputting illegal unichar %s\n", label);
    return;
  }

  // Output all classifier choices for the unigrams (1->1 classifications).
  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
    BLOB_CHOICE_IT blob_choice_it;
    blob_choice_it.set_to_list(outer_blob_choice_it.data());
    for (blob_choice_it.mark_cycle_pt();
         !blob_choice_it.cycled_list();
         blob_choice_it.forward()) {
      BLOB_CHOICE *blob_choice = blob_choice_it.data();
      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
                unicharset.id_to_unichar(blob_choice->unichar_id()),
                label, blob_choice->rating(), blob_choice->certainty());
      }
    }
  }
  // Output raw choices for many->many and 1->many classifications.
  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
}
DECLARE_EXPORT void Skill::writeElement(XMLOutput *o, const Keyword& tag,
                                        mode m) const {
  // Write a reference
  if (m == REFERENCE) {
    o->writeElement(tag, Tags::tag_name, getName());
    return;
  }

  // Write the head
  if (m != NOHEAD && m != NOHEADTAIL)
    o->BeginObject(tag, Tags::tag_name, XMLEscape(getName()));

  // Write source field
  o->writeElement(Tags::tag_source, getSource());

  // Write the custom fields
  PythonDictionary::write(o, getDict());

  // Write the tail
  if (m != NOHEADTAIL && m != NOTAIL)
    o->EndObject(tag);
}
/**********************************************************************
 * get_piece_rating
 *
 * Check to see if this piece has already been classified. If it has,
 * return that rating. Otherwise build the piece from the smaller
 * pieces, classify it, store the rating for later, and take the piece
 * apart again.
 **********************************************************************/
BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
                                            TBLOB *blobs,
                                            const DENORM& denorm,
                                            SEAMS seams,
                                            inT16 start,
                                            inT16 end,
                                            BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs, denorm, seams, start, end, blamer_bundle);
    ratings->put(start, end, choices);
    if (wordrec_debug_level > 1) {
      tprintf("get_piece_rating(): updated ratings matrix\n");
      ratings->print(getDict().getUnicharset());
    }
  }
  return (choices);
}
// Clear the document dictionary for this and all subclassifiers.
void Tesseract::ResetDocumentDictionary() {
  getDict().ResetDocumentDictionary();
  for (int i = 0; i < sub_langs_.size(); ++i) {
    sub_langs_[i]->getDict().ResetDocumentDictionary();
  }
}
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(int32_t elapsed_time) {
#ifndef DISABLED_LEGACY_ENGINE
  EndAdaptiveClassifier();
#endif  // ndef DISABLED_LEGACY_ENGINE
  getDict().End();
}
/**
 * @name dict_word()
 *
 * Test the dictionaries, returning NO_PERM (0) if not found, or one
 * of the PermuterType values if found, according to the dictionary.
 */
int Wordrec::dict_word(const WERD_CHOICE &word) {
  return getDict().valid_word(word);
}
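// Illustrative (hypothetical) caller of dict_word(): NO_PERM (0) means no
// dictionary matched, while any nonzero PermuterType identifies which
// dictionary accepted the word. write_results() below logs exactly this
// value when tessedit_rejection_debug is set.
void Wordrec::debug_dict_word(const WERD_CHOICE &word) {
  int permuter = dict_word(word);
  if (permuter != NO_PERM) {
    tprintf("Dictionary hit, permuter=%d\n", permuter);
  } else {
    tprintf("Not in any dictionary\n");
  }
}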
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs and fixing up
 * all the data, and incrementally runs the segmentation search until a good
 * word is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
                         word, pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has yet been set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
/**
 * @name tess_acceptable_word
 *
 * @return true if the word is regarded as "good enough".
 * @param word the classified word, with its best (after-context) and raw
 *             (before-context) choices already filled in
 */
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
  return getDict().AcceptableResult(word);
}
/**
 * rebuild_current_state
 *
 * Transfers the given state to the word's output fields: rebuild_word,
 * best_state, box_word, and returns the corresponding blob choices.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    WERD_RES *word,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(word->seam_array);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  // Setup the rebuild_word ready for the output blobs.
  if (word->rebuild_word != NULL)
    delete word->rebuild_word;
  word->rebuild_word = new TWERD;
  // Setup the best_state.
  word->best_state.clear();
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // See which index is which below for information on x and y.
  int x = 0;
  int y;
  for (int i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(word->chopped_word->blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (word->best_choice->length() > 0) {
    fragment_lengths = word->best_choice->fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < word->best_choice->length(); ++i) {
      *char_choices += NULL;
      word->best_state.push_back(0);
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (int i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
      word->best_state.push_back(0);
    }
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = word->best_choice->unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = word->best_choice->unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  char unichar[UNICHAR_LEN + 1];

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from original fragment choices will always be
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  int ss_index = search_state[0];
  // Which index is which?
  // char_choices_index refers to the finished product: there is one for each
  // blob/unicharset entry in the final word.
  // ss_index refers to the search_state, and indexes a group (chunk) of blobs
  // that were classified together for the best state.
  // old_choice_index is a copy of ss_index, and accesses the old_choices,
  // which correspond to chunks in the best state. old_choice_index gets
  // set to -1 on a fragment set, as there is no corresponding chunk in
  // the best state.
  // x and y refer to the underlying blobs and are the first and last blob
  // indices in a chunk.
  for (int char_choices_index = char_choices->length() - 1;
       char_choices_index >= 0;
       --char_choices_index) {
    // The start and end of the blob to rebuild.
    int true_x = x;
    int true_y = y;
    // The fake merged fragment choice.
    BLOB_CHOICE* merged_choice = NULL;
    // Test for and combine fragments first.
    int fragment_pieces = expanded_fragment_lengths[ss_index];
    int old_choice_index = ss_index;
    if (fragment_pieces > 1) {
      strncpy(unichar, word_ptr, *word_lengths_ptr);
      unichar[*word_lengths_ptr] = '\0';
      merged_choice = rebuild_fragments(unichar, expanded_fragment_lengths,
                                        old_choice_index, old_choices);
      old_choice_index = -1;
    }
    while (fragment_pieces > 0) {
      true_x = x;
      // Move left to the previous blob.
      y = x - 1;
      x = y - search_state[ss_index--];
      --fragment_pieces;
    }
    word->best_state[char_choices_index] = true_y + 1 - true_x;
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        word, true_x, true_y, old_choice_index, ratings, old_choices);
    if (merged_choice != NULL) {
      // Insert merged_blob into current_choices, such that current_choices
      // are still sorted in non-descending order by rating.
      ASSERT_HOST(!current_choices->empty());
      BLOB_CHOICE_IT choice_it(current_choices);
      for (choice_it.mark_cycle_pt(); !choice_it.cycled_list() &&
           merged_choice->rating() > choice_it.data()->rating();
           choice_it.forward());
      choice_it.add_before_stay_put(merged_choice);
    }

    // Get rid of fragments in current_choices.
    BLOB_CHOICE_IT choice_it(current_choices);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      if (getDict().getUnicharset().get_fragment(
          choice_it.data()->unichar_id())) {
        delete choice_it.extract();
      }
    }
    char_choices->set(current_choices, char_choices_index);

    // Update word_ptr and word_lengths_ptr.
    if (word_lengths_ptr != NULL && word_ptr != NULL) {
      word_lengths_ptr--;
      word_ptr -= (*word_lengths_ptr);
    }
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);
  return char_choices;
}
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(inT32 elapsed_time) {
  EndAdaptiveClassifier();
  getDict().End();
}
BOOL8 Tesseract::word_adaptable(  // should we adapt?
                                WERD_RES *word,
                                uinT16 mode) {
  if (tessedit_adaption_debug) {
    tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
            word->best_choice == NULL ? "" :
            word->best_choice->unichar_string().string(),
            word->best_choice->rating(), word->best_choice->certainty());
  }

  BOOL8 status = FALSE;
  BITS16 flags(mode);

  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /* 0: NO adaption */
  if (mode == 0) {
    if (tessedit_adaption_debug) tprintf("adaption disabled\n");
    return FALSE;
  }

  if (flags.bit(ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_would_adapt bit is false\n");
    }
  }

  if (flags.bit(ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
    if (tessedit_adaption_debug && !status) {
      tprintf("tess_accepted bit is false\n");
    }
  }

  if (!status) {    // If not set then
    return FALSE;   // ignore other checks
  }

  if (flags.bit(CHECK_DAWGS) &&
      (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
      (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
      (word->best_choice->permuter() != USER_DAWG_PERM) &&
      (word->best_choice->permuter() != NUMBER_PERM)) {
    if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
    return FALSE;
  }

  if (flags.bit(CHECK_ONE_ELL_CONFLICT) && one_ell_conflict(word, FALSE)) {
    if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
    return FALSE;
  }

  if (flags.bit(CHECK_SPACES) &&
      (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
    if (tessedit_adaption_debug) tprintf("word contains spaces\n");
    return FALSE;
  }

  //  if (flags.bit(CHECK_AMBIG_WERD) && test_ambig_word(word))
  if (flags.bit(CHECK_AMBIG_WERD) &&
      !getDict().NoDangerousAmbig(word->best_choice, NULL, false,
                                  NULL, NULL)) {
    if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
    return FALSE;
  }

  // Do not adapt to words that are composed from fragments if
  // tessedit_adapt_to_char_fragments is false.
  if (!tessedit_adapt_to_char_fragments) {
    const char *fragment_lengths = word->best_choice->fragment_lengths();
    if (fragment_lengths != NULL && *fragment_lengths != '\0') {
      for (int i = 0; i < word->best_choice->length(); ++i) {
        if (fragment_lengths[i] > 1) {
          if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
          return false;  // found a character composed from fragments
        }
      }
    }
  }

  if (tessedit_adaption_debug) {
    tprintf("returning status %d\n", status);
  }
  return status;
}
void mkraddobjects()
{
    AcGePoint3d pt;
    if (RTNORM != acedGetPoint(NULL, "\nEnter position:", asDblArray(pt)))
        return;

    AsdkMkrEntity* pEnt = new AsdkMkrEntity;
    if (NULL == pEnt)
        return;
    pEnt->setPos(pt);
    if (!append(pEnt)) {
        delete pEnt;
        return;
    }

    AcDbObjectId objId;
    AsdkMkrObject *pObj = new AsdkMkrObject;
    if (NULL == pObj) {
        pEnt->erase();
        pEnt->close();
        return;
    }
#ifdef DIRECT
    acdbHostApplicationServices()->workingDatabase()
        ->addAcDbObject(objId, pObj);
    pObj->close();
#else
#ifdef NOD
    AcDbDictionary* pMyDict = getDict(/*NOXLATE*/"ASDK_MYDICT",
                                      AcDb::kForWrite);
    if (NULL != pMyDict)
        pMyDict->setMergeStyle(AcDb::kDrcMangleName);
#else
    AcDbDictionary* pMyDict = getExtDict(pEnt, /*NOXLATE*/"ASDK_MYDICT",
                                         AcDb::kForWrite);
#endif  // NOD
    if (NULL == pMyDict) {
        delete pObj;
        pEnt->erase();
        pEnt->close();
        return;
    }

    Acad::ErrorStatus es;
    if (pMyDict->has(/*NOXLATE*/"MYENTRY"))
        es = pMyDict->setAt("*", pObj, objId);
    else
        es = pMyDict->setAt(/*NOXLATE*/"MYENTRY", pObj, objId);
    pMyDict->close();

    if (Acad::eOk == es)
        pObj->close();
    else {
        delete pObj;
        pEnt->erase();
        pEnt->close();
        return;
    }
#endif  // DIRECT

    pEnt->setId(objId);
    pEnt->close();
    acutPrintf("\nEv'rything OK\n");
}
// Apply segmentation search to the given set of words, within the constraints
// of the existing ratings matrix. If there is already a best_choice on a
// word, leaves it untouched and just sets the done/accepted etc. flags.
void Tesseract::SearchWords(PointerVector<WERD_RES>* words) {
  // Run the segmentation search on the network outputs and make a BoxWord
  // for each of the output words.
  // If we drop a word as junk, then there is always a space in front of the
  // next.
  const Dict* stopper_dict = lstm_recognizer_->GetDict();
  if (stopper_dict == nullptr) stopper_dict = &getDict();
  bool any_nonspace_delimited = false;
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word = (*words)[w];
    if (word->best_choice != nullptr &&
        word->best_choice->ContainsAnyNonSpaceDelimited()) {
      any_nonspace_delimited = true;
      break;
    }
  }
  for (int w = 0; w < words->size(); ++w) {
    WERD_RES* word = (*words)[w];
    if (word->best_choice == NULL) {
      // If we are using the beam search, the unicharset had better match!
      word->SetupWordScript(unicharset);
      WordSearch(word);
    } else if (word->best_choice->unicharset() == &unicharset &&
               !lstm_recognizer_->IsRecoding()) {
      // We set up the word without using the dictionary, so set the permuter
      // now, but we can only do it because the unicharsets match.
      word->best_choice->set_permuter(
          getDict().valid_word(*word->best_choice, true));
    }
    if (word->best_choice == NULL) {
      // It is a dud.
      word->SetupFake(lstm_recognizer_->GetUnicharset());
    } else {
      // Set the best state.
      for (int i = 0; i < word->best_choice->length(); ++i) {
        int length = word->best_choice->state(i);
        word->best_state.push_back(length);
      }
      word->reject_map.initialise(word->best_choice->length());
      word->tess_failed = false;
      word->tess_accepted = true;
      word->tess_would_adapt = false;
      word->done = true;
      word->tesseract = this;
      float word_certainty = MIN(word->space_certainty,
                                 word->best_choice->certainty());
      word_certainty *= kCertaintyScale;
      // Arbitrary ding factor for non-dictionary words.
      if (!lstm_recognizer_->IsRecoding() &&
          !Dict::valid_word_permuter(word->best_choice->permuter(), true))
        word_certainty -= kNonDictionaryPenalty;
      if (getDict().stopper_debug_level >= 1) {
        tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
                word->best_choice->certainty(), word->space_certainty,
                MIN(word->space_certainty,
                    word->best_choice->certainty()) * kCertaintyScale,
                word_certainty);
        word->best_choice->print();
      }
      // Discard words that are impossibly bad, but allow a bit more for
      // dictionary words, and keep bad words in non-space-delimited langs.
      if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
          any_nonspace_delimited ||
          (word_certainty >= kWorstDictCertainty &&
           Dict::valid_word_permuter(word->best_choice->permuter(), true))) {
        word->tess_accepted = stopper_dict->AcceptableResult(word);
      } else {
        if (getDict().stopper_debug_level >= 1) {
          tprintf("Deleting word with certainty %g\n", word_certainty);
          word->best_choice->print();
        }
        // It is a dud.
        word->SetupFake(lstm_recognizer_->GetUnicharset());
      }
      word->best_choice->set_certainty(word_certainty);
    }
  }
}
/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results. Find the
 * worst blob in the word and chop it up. Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough. Return the word level ratings.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
  TBLOB *blob;
  int index;
  int did_chopping;
  STATE state;
  BLOB_CHOICE_LIST *match_result;
  MATRIX *ratings = NULL;
  DANGERR fixpt;    /* dangerous ambig */
  inT32 bit_count;  // no of bits

  set_denorm(&word->denorm);

  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();

  did_chopping = 0;
  for (blob = word->chopped_word->blobs, index = 0;
       blob != NULL; blob = blob->next, index++) {
    match_result = classify_blob(blob, "chop_word:", Green);
    if (match_result == NULL)
      cprintf("Null classifier output!\n");
    *char_choices += match_result;
  }
  bit_count = index - 1;
  set_n_ones(&state, char_choices->length() - 1);
  bool acceptable = false;
  bool replaced = false;
  bool best_choice_updated =
    getDict().permute_characters(*char_choices, word->best_choice,
                                 word->raw_choice);
  if (best_choice_updated &&
      getDict().AcceptableChoice(char_choices, word->best_choice,
                                 &fixpt, CHOPPER_CALLER, &replaced)) {
    acceptable = true;
  }
  if (replaced)
    update_blob_classifications(word->chopped_word, *char_choices);
  CopyCharChoices(*char_choices, best_char_choices);
  if (!acceptable) {  // do more work to find a better choice
    did_chopping = 1;

    bool best_choice_acceptable = false;
    if (chop_enable)
      improve_by_chopping(word,
                          char_choices,
                          &state,
                          best_char_choices,
                          &fixpt,
                          &best_choice_acceptable);
    if (chop_debug)
      print_seams("Final seam list:", word->seam_array);

    // The force_word_assoc is almost redundant to enable_assoc. However,
    // it is not conditioned on the dict behavior. For CJK, we need to force
    // the associator to be invoked. When we figure out the exact behavior
    // of dict on CJK, we can remove the flag if it turns out to be redundant.
    if ((wordrec_enable_assoc && !best_choice_acceptable) ||
        force_word_assoc) {
      ratings = word_associator(word, &state, best_char_choices,
                                &fixpt, &state);
    }
  }
  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
                                            ratings);
  if (ratings != NULL) {
    if (wordrec_debug_level > 0) {
      tprintf("Final Ratings Matrix:\n");
      ratings->print(getDict().getUnicharset());
    }
    ratings->delete_matrix_pointers();
    delete ratings;
  }
  getDict().FilterWordChoices();
  char_choices->delete_data_pointers();
  delete char_choices;
  return best_char_choices;
}
/**********************************************************************
 * tess_acceptable_word
 *
 * Return true if the word is regarded as "good enough".
 **********************************************************************/
BOOL8 Tesseract::tess_acceptable_word(
    WERD_CHOICE *word_choice,    // after context
    WERD_CHOICE *raw_choice) {   // before context
  return getDict().AcceptableResult(*word_choice, *raw_choice);
}
/**
 * @name program_editdown
 *
 * This function holds any necessary post processing for the Wise Owl
 * program.
 */
void Wordrec::program_editdown(inT32 elapsed_time) {
  EndAdaptiveClassifier();
  blob_match_table.end_match_table();
  getDict().InitChoiceAccum();
  getDict().End();
}
Wordrec::Wordrec() :
  // control parameters
  BOOL_MEMBER(merge_fragments_in_matrix, TRUE,
              "Merge the fragments in the ratings matrix and delete them"
              " after merging", params()),
  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
              params()),
  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable", params()),
  BOOL_MEMBER(force_word_assoc, FALSE,
              "force associator to run regardless of what enable_assoc is."
              " This is used for CJK where component grouping is necessary.",
              CCUtil::params()),
  INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
             CCUtil::params()),
  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
                params()),
  BOOL_MEMBER(fragments_guide_chopper, FALSE,
              "Use information from fragments to guide chopping process",
              params()),
  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
             params()),
  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
                params()),
  INT_MEMBER(chop_debug, 0, "Chop debug", params()),
  BOOL_MEMBER(chop_enable, 1, "Chop enable", params()),
  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep", params()),
  INT_MEMBER(chop_split_length, 10000, "Split Length", params()),
  INT_MEMBER(chop_same_distance, 2, "Same distance", params()),
  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
             params()),
  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend", params()),
  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area", params()),
  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
                params()),
  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
                params()),
  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
                params()),
  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
                params()),
  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
                params()),
  double_MEMBER(chop_ok_split, 100.0, "OK split limit", params()),
  double_MEMBER(chop_good_split, 50.0, "Good split limit", params()),
  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight", params()),
  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
             params()),
  BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE,
              "include fixed-pitch heuristics in char segmentation",
              params()),
  BOOL_MEMBER(use_new_state_cost, FALSE,
              "use new state cost heuristics for segmentation state"
              " evaluation", params()),
  double_MEMBER(heuristic_segcost_rating_base, 1.25,
                "base factor for adding segmentation cost into word rating."
                " It's a multiplying factor, the larger the value above 1, "
                "the bigger the effect of segmentation cost.", params()),
  double_MEMBER(heuristic_weight_rating, 1.0,
                "weight associated with char rating in combined cost of"
                " state", params()),
  double_MEMBER(heuristic_weight_width, 1000.0,
                "weight associated with width evidence in combined cost of"
                " state", params()),
  double_MEMBER(heuristic_weight_seamcut, 0.0,
                "weight associated with seam cut in combined cost of state",
                params()),
  double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
                "max char width-to-height ratio allowed in segmentation",
                params()),
  INT_MEMBER(wordrec_debug_level, 0, "Debug level for wordrec", params()),
  BOOL_MEMBER(wordrec_debug_blamer, false, "Print blamer debug messages",
              params()),
  BOOL_MEMBER(wordrec_run_blamer, false, "Try to set the blame for errors",
              params()),
  BOOL_MEMBER(enable_new_segsearch, true,
              "Enable new segmentation search path.", params()),
  INT_MEMBER(segsearch_debug_level, 0, "SegSearch debug level", params()),
  INT_MEMBER(segsearch_max_pain_points, 2000,
             "Maximum number of pain points stored in the queue", params()),
  INT_MEMBER(segsearch_max_futile_classifications, 10,
             "Maximum number of pain point classifications per word that"
             " did not result in finding a better word choice.", params()),
  double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
                "Maximum character width-to-height ratio", params()),
  double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
                "Maximum character width-to-height ratio for"
                " fixed-pitch fonts", params()),
  BOOL_MEMBER(save_alt_choices, false,
              "Save alternative paths found during chopping"
              " and segmentation search", params()) {
  prev_word_best_choice_ = NULL;
  language_model_ = new LanguageModel(&get_fontinfo_table(), &(getDict()));
  pass2_seg_states = 0;
  num_joints = 0;
  num_pushed = 0;
  num_popped = 0;
  fill_lattice_ = NULL;
}
/**
 * @name evaluate_state
 *
 * Evaluate the segmentation that is represented by this state in the
 * best first search. Add this state to the "states_seen" list.
 */
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt,
                              BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices;
  SEARCH_STATE chunk_groups;
  float rating_limit = the_search->best_choice->rating();
  bool keep_going = true;
  PIECES_STATE widths;

  the_search->num_states++;
  chunk_groups = bin_to_chunks(the_search->this_state,
                               the_search->num_joints);
  bin_to_pieces(the_search->this_state, the_search->num_joints, widths);
  if (wordrec_debug_level > 1) {
    log_state("Evaluating state", the_search->num_joints,
              the_search->this_state);
  }
  getDict().LogNewSegmentation(widths);

  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
  getDict().SetWordsegRatingAdjustFactor(-1.0f);
  bool updated_best_choice = false;
  if (char_choices != NULL && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias);
    updated_best_choice =
      getDict().permute_characters(*char_choices,
                                   the_search->best_choice,
                                   the_search->raw_choice);
    bool replaced = false;
    if (updated_best_choice) {
      if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                     NULL, ASSOCIATOR_CALLER, &replaced)) {
        keep_going = false;
      }
      CopyCharChoices(*char_choices, the_search->best_char_choices);
    }
  }
  getDict().SetWordsegRatingAdjustFactor(-1.0f);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation(chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  if (rating_limit != the_search->best_choice->rating()) {
    ASSERT_HOST(updated_best_choice);
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else {
    ASSERT_HOST(!updated_best_choice);
    if (char_choices != NULL) fixpt->clear();
  }

  if (char_choices != NULL) delete char_choices;
  memfree(chunk_groups);
  return (keep_going);
}
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model.
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column,
  // since this guarantees that all the parents would be up to date before
  // an update of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
          ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up.
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the
 *                   .ep file
 *   epchoice list - a list of one element per word, containing the text for
 *                   the epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by
 *                   the reject strings in the epchoice text.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  char unrecognised = STRING(unrecognised_char)[0];
  char ep_chars[32];  // Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];   // Only for unlv_tilde_crunch
  char map_chs[32];   // Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = unicharset.unichar_to_id(" ");

  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)))) {
      if (!word->word->flag(W_BOL) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      } else {
        map_chs[txt_index++] = '0';
        /*
          The ep_choice string is a faked reject to allow newdiff to sync
          the .etx with the .txt and .map files.
        */
        ep_chars[ep_chars_index++] = CTRL_INSET;  // escape code
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 2;           // type
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 1;           // dummy reject
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) ||
        force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
      // end line
      ep_chars[ep_chars_index++] = newline_type;
      // Because of the real newline
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes
       within words have been removed */
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(
          word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->best_choice->populate_unichars(getDict().getUnicharset());
    word->reject_map.remove_pos(0);
    delete word->box_word;
    word->box_word = new BoxWord;
  }
  if (newline_type ||
      (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length() > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) ==
          space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    } else if (word->word->space() > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt(word, 120);
  if (tessedit_rejection_debug) {
    tprintf("Dict word: \"%s\": %d\n",
            word->best_choice->debug_string(unicharset).string(),
            dict_word(*(word->best_choice)));
  }
  if (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) {
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += unicharset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of
/// consecutive blobs, with index 0 being a single blob, index 1 being 2
/// blobs etc.
/// @param choices_pos
/// @param choices_length
/// @param target_text
/// @param text_index
/// @param rating
/// @param segmentation
/// @param best_rating
/// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                              int choices_pos, int choices_length,
                              const GenericVector<UNICHAR_ID>& target_text,
                              int text_index,
                              float rating,
                              GenericVector<int>* segmentation,
                              float* best_rating,
                              GenericVector<int>* best_segmentation) {
  const UnicharAmbigsVector& table =
      getDict().getUnicharAmbigs().dang_ambigs();
  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
    // Rating of matching choice or worst choice if no match.
    float choice_rating = 0.0f;
    // Find the corresponding best BLOB_CHOICE.
    BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
    for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
         choice_it.forward()) {
      BLOB_CHOICE* choice = choice_it.data();
      choice_rating = choice->rating();
      UNICHAR_ID class_id = choice->unichar_id();
      if (class_id == target_text[text_index]) {
        break;
      }
      // Search ambigs table.
      if (class_id < table.size() && table[class_id] != NULL) {
        AmbigSpec_IT spec_it(table[class_id]);
        for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
             spec_it.forward()) {
          const AmbigSpec *ambig_spec = spec_it.data();
          // We'll only do 1-1.
          if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
              ambig_spec->correct_ngram_id == target_text[text_index])
            break;
        }
        if (!spec_it.cycled_list()) break;  // Found an ambig.
      }
    }
    if (choice_it.cycled_list()) continue;  // No match.
    segmentation->push_back(length);
    if (choices_pos + length == choices_length &&
        text_index + 1 == target_text.size()) {
      // This is a complete match. If the rating is good record a new best.
      if (applybox_debug > 2) {
        tprintf("Complete match, rating = %g, best=%g, seglength=%d, "
                "best=%d\n",
                rating + choice_rating, *best_rating, segmentation->size(),
                best_segmentation->size());
      }
      if (best_segmentation->empty() ||
          rating + choice_rating < *best_rating) {
        *best_segmentation = *segmentation;
        *best_rating = rating + choice_rating;
      }
    } else if (choices_pos + length < choices_length &&
               text_index + 1 < target_text.size()) {
      if (applybox_debug > 3) {
        tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]),
                choice_it.data()->unichar_id() == target_text[text_index]
                    ? "Match" : "Ambig",
                choices_pos, length);
      }
      SearchForText(choices, choices_pos + length, choices_length,
                    target_text, text_index + 1, rating + choice_rating,
                    segmentation, best_rating, best_segmentation);
      if (applybox_debug > 3) {
        tprintf("End recursion for %d=%s\n", target_text[text_index],
                unicharset.id_to_unichar(target_text[text_index]));
      }
    }
    segmentation->truncate(segmentation->size() - 1);
  }
}
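// Hypothetical kick-off for the recursion above, assuming a choices array
// with one GenericVector per starting blob and a target_text holding the
// word's characters. Starting with an empty best_segmentation means the
// first complete match always initializes *best_rating (see the
// best_segmentation->empty() test in SearchForText()). The wrapper name and
// signature are illustrative only.
void Tesseract::FindBestSegmentation(
    const GenericVector<BLOB_CHOICE_LIST*>* choices, int word_length,
    const GenericVector<UNICHAR_ID>& target_text,
    GenericVector<int>* best_segmentation) {
  GenericVector<int> segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                &segmentation, &best_rating, best_segmentation);
  // On success, (*best_segmentation)[i] is the number of consecutive blobs
  // merged into output character i.
}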
/**
 * @name tess_add_doc_word
 *
 * Add the given word to the document dictionary.
 */
void Tesseract::tess_add_doc_word(WERD_CHOICE *word_choice) {
  getDict().add_document_word(*word_choice);
}
/**
 * rebuild_current_state
 *
 * Rebuild the blobs described by the given state: join the blobs in each
 * chunk, re-classify where needed, merge character fragments, and return
 * the updated blob choices.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuilding state", state, num_joints);
  }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  int x = 0;
  int y;
  int i;
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    for (int i = 0; i < best_choice.length(); ++i) {
      *char_choices += NULL;
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments
  // to merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = best_choice.unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from original fragment choices will always be
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(
              temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
          new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                          rating, certainty, 0, NO_PERM);
        // Insert merged_blob into current_choices, such that current_choices
        // are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward());
        temp_it.add_before_stay_put(merged_choice);
        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Update word_ptr and word_lengths_ptr.
      if (word_lengths_ptr != NULL && word_ptr != NULL) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    y = x - 1;
    x = y - search_state[i];
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);
  return (char_choices);
}
/**********************************************************************
 * select_blob_to_split
 *
 * These are the results of the last classification. Find a likely
 * place to apply splits. If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split(
    const GenericVector<BLOB_CHOICE*>& blob_choices,
    float rating_ceiling, bool split_next_to_fragment) {
  BLOB_CHOICE *blob_choice;
  int x;
  float worst = -MAX_FLOAT32;
  int worst_index = -1;
  float worst_near_fragment = -MAX_FLOAT32;
  int worst_index_near_fragment = -1;
  const CHAR_FRAGMENT **fragments = NULL;

  if (chop_debug) {
    if (rating_ceiling < MAX_FLOAT32)
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    else
      tprintf("rating_ceiling = No Limit\n");
  }

  if (split_next_to_fragment && blob_choices.size() > 0) {
    fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
    if (blob_choices[0] != NULL) {
      fragments[0] = getDict().getUnicharset().get_fragment(
          blob_choices[0]->unichar_id());
    } else {
      fragments[0] = NULL;
    }
  }

  for (x = 0; x < blob_choices.size(); ++x) {
    if (blob_choices[x] == NULL) {
      if (fragments != NULL) {
        delete[] fragments;
      }
      return x;
    } else {
      blob_choice = blob_choices[x];
      // Populate fragments for the following position.
      if (split_next_to_fragment && x+1 < blob_choices.size()) {
        if (blob_choices[x + 1] != NULL) {
          fragments[x + 1] = getDict().getUnicharset().get_fragment(
              blob_choices[x + 1]->unichar_id());
        } else {
          fragments[x + 1] = NULL;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          bool expand_following_fragment =
            (x + 1 < blob_choices.size() &&
             fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
          bool expand_preceding_fragment =
            (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              tprintf("worst_index_near_fragment=%d"
                      " expand_following_fragment=%d"
                      " expand_preceding_fragment=%d\n",
                      worst_index_near_fragment,
                      expand_following_fragment,
                      expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  if (fragments != NULL) {
    delete[] fragments;
  }
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ?
    worst_index_near_fragment : worst_index;
}
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  GenericVector<SegSearchPending> pending;
  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
                   blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pain_points, &pending);
    }
    if (chop_debug)
      SEAM::PrintSeams("Final seam list:", word_res->seam_array);

    if (blamer_bundle != NULL &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }
  // Keep trying to find a better path by fixing the "pain points".
  MATRIX_COORD pain_point;
  float pain_point_priority;
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
         (!SegSearchDone(num_futile_classifications) ||
          (blamer_bundle != NULL &&
           blamer_bundle->GuidedSegsearchStillGoing()))) {
    // Get the next valid "pain point".
    bool found_nothing = true;
    LMPainPointsType pp_type;
    while ((pp_type = pain_points.Deque(&pain_point,
                                        &pain_point_priority)) !=
           LM_PPTYPE_NUM) {
      if (!pain_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            pain_point.row - pain_point.col + 1);
      }
      if (pain_point.Valid(*word_res->ratings) &&
          !word_res->ratings->Classified(pain_point.col, pain_point.row,
                                         getDict().WildcardID())) {
        found_nothing = false;
        break;
      }
    }
    if (found_nothing) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, pain_point,
                              LMPainPoints::PainPointDescription(pp_type),
                              &pending, word_res, &pain_points,
                              blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
                         word_res, &pain_points, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle->updated = false;  // reset updated

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect
    // best_choice.
    if (SegSearchDone(num_futile_classifications) &&
        blamer_bundle != NULL &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
                             &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  if (blamer_bundle != NULL) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_debug);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}