// Remove outlines that are a tiny fraction in either width or height // of the word height. void Textord::clean_small_noise_from_words(ROW *row) { WERD_IT word_it(row->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { WERD* word = word_it.data(); int min_size = static_cast<int>( textord_noise_hfract * word->bounding_box().height() + 0.5); C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT out_it(blob->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { C_OUTLINE* outline = out_it.data(); outline->RemoveSmallRecursive(min_size, &out_it); } if (blob->out_list()->empty()) { delete blob_it.extract(); } } if (word->cblob_list()->empty()) { if (!word_it.at_last()) { // The next word is no longer a fuzzy non space if it was before, // since the word before is about to be deleted. WERD* next_word = word_it.data_relative(1); if (next_word->flag(W_FUZZY_NON)) { next_word->set_flag(W_FUZZY_NON, false); } } delete word_it.extract(); } } }
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); TO_ROW* row = to_row_it.data(); // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready // to create the word. C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(row->blob_list()); for (;!box_it.empty(); box_it.forward()) { BLOBNBOX* bblob= box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL) { C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); delete bblob; } } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size), static_cast<inT16>(row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word_it.add_after_then_move(word); ROW_IT row_it(real_rows); row_it.add_after_then_move(real_row); }
//yangjing01 modified : bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); ROW_IT row_it(real_rows); //to_real_row is the real row information of single row or single char mode TO_ROW* real_to_row = NULL; float row_max_height = 0.0; for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()){ TO_ROW* row = to_row_it.data(); float row_min_y = row->min_y(); float row_max_y = row->max_y(); float row_height = abs(row_max_y - row_min_y); if (real_to_row == NULL || row_height > row_max_height || fabs(row_height - row_max_height) < 1.0f){ row_max_height = row_height; real_to_row = row; } } if (real_to_row == NULL){ return false; } C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(real_to_row->blob_list()); for (; !box_it.empty(); box_it.forward()){ BLOBNBOX* bblob = box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL){ C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size), static_cast<inT16>(real_to_row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); row_it.add_after_then_move(real_row); return true; }
void match_current_words(WERD_RES_LIST &words, ROW *row) { WERD_RES_IT word_it(&words); WERD_RES *word; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((!word->part_of_combo) && (word->outword == NULL)) classify_word_pass2(word, row); } }
static void clear_any_old_text(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { ROW_IT row_it(block_it.data()->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { WERD_IT word_it(row_it.data()->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word_it.data()->set_text(""); } } } }
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block) { WERD_RES_IT word_it(&words); WERD_RES *word; // Since we are not using PAGE_RES to iterate over words, we need to update // prev_word_best_choice_ before calling classify_word_pass2(). prev_word_best_choice_ = NULL; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if ((!word->part_of_combo) && (word->box_word == NULL)) { classify_word_and_language(&Tesseract::classify_word_pass2, block, row, word); } prev_word_best_choice_ = word->best_choice; } }
void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter) { FactorCollection &factorCollection = FactorCollection::Instance(); for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) { Word &word = AddWord(); size_t index = 0; for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter)); factor_it && (index < factorOrder.size()); ++factor_it, ++index) { word[factorOrder[index]] = factorCollection.AddFactor(*factor_it); } if (index != factorOrder.size()) { TRACE_ERR( "[ERROR] Malformed input: '" << *word_it << "'" << std::endl << "In '" << phraseString << "'" << endl << " Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl << " but instead received input with " << index << " factor(s).\n"); abort(); } } }
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_it(&word_res_list); WERD_RES *word; PBLOB_IT blob_it; inT16 word_length; inT16 score = 0; inT16 i; inT16 offset; const char *chs; float small_limit = bln_x_height * fixsp_small_outlines_size; if (!fixsp_fp_eval) return (eval_word_spacing (word_res_list)); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_length = word->reject_map.length (); chs = word->best_choice->string ().string (); if ((word->done || word->tess_accepted) || (word->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word->best_choice->permuter () == FREQ_DAWG_PERM) || (word->best_choice->permuter () == USER_DAWG_PERM) || (safe_dict_word (chs) > 0)) { blob_it.set_to_list (word->outword->blob_list ()); for (i = 0, offset = 0; i < word_length; offset += word->best_choice->lengths()[i++], blob_it.forward ()) { if ((chs[offset] == ' ') || (blob_noise_score (blob_it.data ()) < small_limit)) score -= 1; //penalise possibly erroneous non-space else if (word->reject_map[i].accepted ()) score++; } } } if (score < 0) score = 0; return score; }
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_it(&word_res_list); WERD_RES *word; inT16 word_length; inT16 score = 0; inT16 i; float small_limit = kBlnXHeight * fixsp_small_outlines_size; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if (word->rebuild_word == NULL) continue; // Can't handle cube words. word_length = word->reject_map.length(); if (word->done || word->tess_accepted || word->best_choice->permuter() == SYSTEM_DAWG_PERM || word->best_choice->permuter() == FREQ_DAWG_PERM || word->best_choice->permuter() == USER_DAWG_PERM || safe_dict_word(word) > 0) { TBLOB* blob = word->rebuild_word->blobs; UNICHAR_ID space = word->uch_set->unichar_to_id(" "); for (i = 0; i < word->best_choice->length() && blob != NULL; ++i, blob = blob->next) { if (word->best_choice->unichar_id(i) == space || blob_noise_score(blob) < small_limit) { score -= 1; // penalise possibly erroneous non-space } else if (word->reject_map[i].accepted()) { score++; } } } } if (score < 0) score = 0; return score; }
/** * break_noisiest_blob_word() * Find the word with the blob which looks like the worst noise. * Break the word into two, deleting the noise blob. */ void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) { WERD_RES_IT word_it(&words); WERD_RES_IT worst_word_it; float worst_noise_score = 9999; int worst_blob_index = -1; // Noisiest blob of noisiest wd int blob_index; // of wds noisiest blob float noise_score; // of wds noisiest blob WERD_RES *word_res; C_BLOB_IT blob_it; C_BLOB_IT rej_cblob_it; C_BLOB_LIST new_blob_list; C_BLOB_IT new_blob_it; C_BLOB_IT new_rej_cblob_it; WERD *new_word; inT16 start_of_noise_blob; inT16 i; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { blob_index = worst_noise_blob(word_it.data(), &noise_score); if (blob_index > -1 && worst_noise_score > noise_score) { worst_noise_score = noise_score; worst_blob_index = blob_index; worst_word_it = word_it; } } if (worst_blob_index < 0) { words.clear(); // signal termination return; } /* Now split the worst_word_it */ word_res = worst_word_it.data(); /* Move blobs before noise blob to a new bloblist */ new_blob_it.set_to_list(&new_blob_list); blob_it.set_to_list(word_res->word->cblob_list()); for (i = 0; i < worst_blob_index; i++, blob_it.forward()) { new_blob_it.add_after_then_move(blob_it.extract()); } start_of_noise_blob = blob_it.data()->bounding_box().left(); delete blob_it.extract(); // throw out noise blob new_word = new WERD(&new_blob_list, word_res->word); new_word->set_flag(W_EOL, FALSE); word_res->word->set_flag(W_BOL, FALSE); word_res->word->set_blanks(1); // After break new_rej_cblob_it.set_to_list(new_word->rej_cblob_list()); rej_cblob_it.set_to_list(word_res->word->rej_cblob_list()); for (; (!rej_cblob_it.empty() && (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob)); rej_cblob_it.forward()) { new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract()); } WERD_RES* new_word_res = new WERD_RES(new_word); new_word_res->combination = TRUE; worst_word_it.add_before_then_move(new_word_res); word_res->ClearResults(); }
/** * @name transform_to_next_perm() * Examines the current word list to find the smallest word gap size. Then walks * the word list closing any gaps of this size by either inserted new * combination words, or extending existing ones. * * The routine COULD be limited to stop it building words longer than N blobs. * * If there are no more gaps then it DELETES the entire list and returns the * empty list to cause termination. */ void transform_to_next_perm(WERD_RES_LIST &words) { WERD_RES_IT word_it(&words); WERD_RES_IT prev_word_it(&words); WERD_RES *word; WERD_RES *prev_word; WERD_RES *combo; WERD *copy_word; inT16 prev_right = -MAX_INT16; TBOX box; inT16 gap; inT16 min_gap = MAX_INT16; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word = word_it.data(); if (!word->part_of_combo) { box = word->word->bounding_box(); if (prev_right > -MAX_INT16) { gap = box.left() - prev_right; if (gap < min_gap) min_gap = gap; } prev_right = box.right(); } } if (min_gap < MAX_INT16) { prev_right = -MAX_INT16; // back to start word_it.set_to_list(&words); // Note: we can't use cycle_pt due to inserted combos at start of list. for (; (prev_right == -MAX_INT16) || !word_it.at_first(); word_it.forward()) { word = word_it.data(); if (!word->part_of_combo) { box = word->word->bounding_box(); if (prev_right > -MAX_INT16) { gap = box.left() - prev_right; if (gap <= min_gap) { prev_word = prev_word_it.data(); if (prev_word->combination) { combo = prev_word; } else { /* Make a new combination and insert before * the first word being joined. */ copy_word = new WERD; *copy_word = *(prev_word->word); // deep copy combo = new WERD_RES(copy_word); combo->combination = TRUE; combo->x_height = prev_word->x_height; prev_word->part_of_combo = TRUE; prev_word_it.add_before_then_move(combo); } combo->word->set_flag(W_EOL, word->word->flag(W_EOL)); if (word->combination) { combo->word->join_on(word->word); // Move blobs to combo // old combo no longer needed delete word_it.extract(); } else { // Copy current wd to combo combo->copy_on(word); word->part_of_combo = TRUE; } combo->done = FALSE; combo->ClearResults(); } else { prev_word_it = word_it; // catch up } } prev_right = box.right(); } } } else { words.clear(); // signal termination } }