/** * @name fix_sp_fp_word() * Test the current word to see if it can be split by deleting noise blobs. If * so, do the business. * Return with the iterator pointing to the same place if the word is unchanged, * or the last of the replacement words. */ void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block) { WERD_RES *word_res; WERD_RES_LIST sub_word_list; WERD_RES_IT sub_word_list_it(&sub_word_list); inT16 blob_index; inT16 new_length; float junk; word_res = word_res_it.data(); if (word_res->word->flag(W_REP_CHAR) || word_res->combination || word_res->part_of_combo || !word_res->word->flag(W_DONT_CHOP)) return; blob_index = worst_noise_blob(word_res, &junk); if (blob_index < 0) return; if (debug_fix_space_level > 1) { tprintf("FP fixspace working on \"%s\"\n", word_res->best_choice->unichar_string().string()); } word_res->word->rej_cblob_list()->sort(c_blob_comparator); sub_word_list_it.add_after_stay_put(word_res_it.extract()); fix_noisy_space_list(sub_word_list, row, block); new_length = sub_word_list.length(); word_res_it.add_list_before(&sub_word_list); for (; !word_res_it.at_last() && new_length > 1; new_length--) { word_res_it.forward(); } }
/************************************************************************* * fix_sp_fp_word() * Test the current word to see if it can be split by deleting noise blobs. If * so, do the buisiness. * Return with the iterator pointing to the same place if the word is unchanged, * or the last of the replacement words. *************************************************************************/ void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row) { WERD_RES *word_res; WERD_RES_LIST sub_word_list; WERD_RES_IT sub_word_list_it(&sub_word_list); inT16 blob_index; inT16 new_length; float junk; word_res = word_res_it.data (); if (!fixsp_check_for_fp_noise_space || word_res->word->flag (W_REP_CHAR) || word_res->combination || word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP)) return; blob_index = worst_noise_blob (word_res, &junk); if (blob_index < 0) return; #ifndef SECURE_NAMES if (debug_fix_space_level > 1) { tprintf ("FP fixspace working on \"%s\"\n", word_res->best_choice->string ().string ()); } #endif gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE); sub_word_list_it.add_after_stay_put (word_res_it.extract ()); fix_noisy_space_list(sub_word_list, row); new_length = sub_word_list.length (); word_res_it.add_list_before (&sub_word_list); for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) { word_res_it.forward (); } }
/**
 * break_noisiest_blob_word()
 * Find the word with the blob which looks like the worst noise.
 * Break the word into two, deleting the noise blob.
 *
 * Every word in the list is scored with worst_noise_blob(); the word whose
 * reported blob has the lowest score is the one that gets split. If no word
 * reports a noise blob, the list is cleared, which signals termination to
 * the caller.
 *
 * @param words list of words to search; modified in place
 */
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
  WERD_RES_IT scan_it(&words);
  WERD_RES_IT worst_word_it;
  float lowest_score = 9999;
  int worst_blob_index = -1;      // Noisiest blob of noisiest wd

  // Pass 1: locate the word holding the noisiest blob.
  for (scan_it.mark_cycle_pt(); !scan_it.cycled_list(); scan_it.forward()) {
    float score;                  // of this word's noisiest blob
    int candidate = worst_noise_blob(scan_it.data(), &score);
    if (candidate >= 0 && score < lowest_score) {
      lowest_score = score;
      worst_blob_index = candidate;
      worst_word_it = scan_it;
    }
  }

  if (worst_blob_index < 0) {
    words.clear();                // signal termination
    return;
  }

  /* Now split the worst_word_it */
  WERD_RES *victim = worst_word_it.data();

  /* Move blobs before noise blob to a new bloblist */
  C_BLOB_LIST leading_blobs;
  C_BLOB_IT leading_it;
  leading_it.set_to_list(&leading_blobs);
  C_BLOB_IT source_it;
  source_it.set_to_list(victim->word->cblob_list());
  for (inT16 moved = 0; moved < worst_blob_index; moved++) {
    leading_it.add_after_then_move(source_it.extract());
    source_it.forward();
  }

  // Remember where the noise blob started before discarding it; the
  // rejected blobs are divided at this x position below.
  inT16 noise_left_edge = source_it.data()->bounding_box().left();
  delete source_it.extract();     // throw out noise blob

  // The new word takes the leading blobs and precedes the original word.
  WERD *leading_word = new WERD(&leading_blobs, victim->word);
  leading_word->set_flag(W_EOL, FALSE);
  victim->word->set_flag(W_BOL, FALSE);
  victim->word->set_blanks(1);    // After break

  // Hand over the rejected blobs that start left of the noise blob.
  C_BLOB_IT leading_rej_it;
  leading_rej_it.set_to_list(leading_word->rej_cblob_list());
  C_BLOB_IT source_rej_it;
  source_rej_it.set_to_list(victim->word->rej_cblob_list());
  while (!source_rej_it.empty() &&
         source_rej_it.data()->bounding_box().left() < noise_left_edge) {
    leading_rej_it.add_after_then_move(source_rej_it.extract());
    source_rej_it.forward();
  }

  WERD_RES* leading_res = new WERD_RES(leading_word);
  leading_res->combination = TRUE;
  worst_word_it.add_before_then_move(leading_res);

  victim->ClearResults();
}