// Computes the box that encloses every outline in this blob's outline list.
// Each outline's own bounding box is accumulated into the result with +=;
// if the loop visits no outlines the default-constructed TBOX is returned.
TBOX C_BLOB::bounding_box() {
  TBOX enclosing;                      // accumulated result
  C_OUTLINE_IT outline_it(&outlines);  // walks the blob's outlines

  for (outline_it.mark_cycle_pt(); !outline_it.cycled_list();
       outline_it.forward()) {
    enclosing += outline_it.data()->bounding_box();
  }
  return enclosing;
}
// Adds the selected outlines to the indcated real blobs, and puts the rest // back in rej_cblobs where they came from. Where the target_blobs entry is // nullptr, a run of wanted outlines is put into a single new blob. // Ownership of the outlines is transferred back to the word. (Hence // GenericVector and not PointerVector.) // Returns true if any new blob was added to the start of the word, which // suggests that it might need joining to the word before it, and likewise // sets make_next_word_fuzzy true if any new blob was added to the end. bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted, const GenericVector<C_BLOB*>& target_blobs, const GenericVector<C_OUTLINE*>& outlines, bool* make_next_word_fuzzy) { bool outline_added_to_start = false; if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = false; C_BLOB_IT rej_it(&rej_cblobs); for (int i = 0; i < outlines.size(); ++i) { C_OUTLINE* outline = outlines[i]; if (outline == nullptr) continue; // Already used it. if (wanted[i]) { C_BLOB* target_blob = target_blobs[i]; TBOX noise_box = outline->bounding_box(); if (target_blob == nullptr) { target_blob = new C_BLOB(outline); // Need to find the insertion point. C_BLOB_IT blob_it(&cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (blob_box.left() > noise_box.left()) { if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { // We might want to join this word to its predecessor. outline_added_to_start = true; } blob_it.add_before_stay_put(target_blob); break; } } if (blob_it.cycled_list()) { blob_it.add_to_end(target_blob); if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = true; } // Add all consecutive wanted, but null-blob outlines to same blob. 
C_OUTLINE_IT ol_it(target_blob->out_list()); while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { ++i; ol_it.add_to_end(outlines[i]); } } else { // Insert outline into this blob. C_OUTLINE_IT ol_it(target_blob->out_list()); ol_it.add_to_end(outline); } } else { // Put back on noise list. rej_it.add_to_end(new C_BLOB(outline)); } } return outline_added_to_start; }
void close_chopped_cfragments( //chop the outline C_OUTLINE_FRAG_LIST *frags, //list to clear C_OUTLINE_LIST *children, //potential children float pitch_error, //allowed shrinkage C_OUTLINE_IT *dest_it //output list ) { //iterator C_OUTLINE_FRAG_IT frag_it = frags; C_OUTLINE_FRAG *bottom_frag; //bottom of cut C_OUTLINE_FRAG *top_frag; //top of cut C_OUTLINE *outline; //new outline C_OUTLINE *child; //current child C_OUTLINE_IT child_it = children; C_OUTLINE_IT olchild_it; //children of outline while (!frag_it.empty()) { frag_it.move_to_first(); // get bottom one bottom_frag = frag_it.extract(); frag_it.forward(); top_frag = frag_it.data(); // look at next if ((bottom_frag->steps == nullptr && top_frag->steps == nullptr) || (bottom_frag->steps != nullptr && top_frag->steps != nullptr)) { if (frag_it.data_relative(1)->ycoord == top_frag->ycoord) frag_it.forward(); } top_frag = frag_it.extract(); if (top_frag->other_end != bottom_frag) { outline = join_chopped_fragments(bottom_frag, top_frag); ASSERT_HOST(outline == nullptr); } else { outline = join_chopped_fragments(bottom_frag, top_frag); if (outline != nullptr) { olchild_it.set_to_list(outline->child()); for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) { child = child_it.data(); if (*child < *outline) olchild_it.add_to_end(child_it.extract()); } if (outline->bounding_box().width() > pitch_error) dest_it->add_after_then_move(outline); else delete outline; // Make it disappear. } } } while (!child_it.empty ()) { dest_it->add_after_then_move (child_it.extract ()); child_it.forward (); } }
void fill_buckets( // find blobs C_OUTLINE_LIST *outlines, // outlines in block OL_BUCKETS *buckets // output buckets ) { TBOX ol_box; // outline box C_OUTLINE_IT out_it = outlines; // iterator C_OUTLINE_IT bucket_it; // iterator in bucket C_OUTLINE *outline; // current outline for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { outline = out_it.extract(); // take off list // get box ol_box = outline->bounding_box(); bucket_it.set_to_list((*buckets) (ol_box.left(), ol_box.bottom())); bucket_it.add_to_end(outline); } }
// Removes noise from the word by moving small outlines to the rej_cblobs // list, based on the size_threshold. void WERD::CleanNoise(float size_threshold) { C_BLOB_IT blob_it(&cblobs); C_BLOB_IT rej_it(&rej_cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT ol_it(blob->out_list()); for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { C_OUTLINE* outline = ol_it.data(); TBOX ol_box = outline->bounding_box(); int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); if (ol_size < size_threshold) { // This outline is too small. Move it to a separate blob in the // reject blobs list. C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); rej_it.add_after_then_move(rej_blob); } } if (blob->out_list()->empty()) delete blob_it.extract(); } }
void fixed_split_coutline( //chop the outline C_OUTLINE *srcline, //source outline int16_t chop_coord, //place to chop float pitch_error, //allowed deviation C_OUTLINE_IT *left_it, //left half of chop C_OUTLINE_IT *right_it //right half of chop ) { C_OUTLINE *child; //child outline TBOX srcbox; //box of outline C_OUTLINE_LIST left_ch; //left children C_OUTLINE_LIST right_ch; //right children C_OUTLINE_FRAG_LIST left_frags;//chopped fragments C_OUTLINE_FRAG_LIST right_frags;; C_OUTLINE_IT left_ch_it = &left_ch; //for whole children C_OUTLINE_IT right_ch_it = &right_ch; //for holes C_OUTLINE_IT child_it = srcline->child (); srcbox = srcline->bounding_box(); if (srcbox.left() + srcbox.right() <= chop_coord * 2 && srcbox.right() < chop_coord + pitch_error) { // Whole outline is in the left side or not far over the chop_coord, // so put the whole thing on the left. left_it->add_after_then_move(srcline); } else if (srcbox.left() + srcbox.right() > chop_coord * 2 && srcbox.left () > chop_coord - pitch_error) { // Whole outline is in the right side or not far over the chop_coord, // so put the whole thing on the right. right_it->add_before_stay_put(srcline); } else { // Needs real chopping. if (fixed_chop_coutline(srcline, chop_coord, pitch_error, &left_frags, &right_frags)) { for (child_it.mark_cycle_pt(); !child_it.cycled_list(); child_it.forward()) { child = child_it.extract(); srcbox = child->bounding_box(); if (srcbox.right() < chop_coord) { // Whole child is on the left. left_ch_it.add_after_then_move(child); } else if (srcbox.left() > chop_coord) { // Whole child is on the right. right_ch_it.add_after_then_move (child); } else { // No pitch_error is allowed when chopping children to prevent // impossible outlines from being created. 
if (fixed_chop_coutline(child, chop_coord, 0.0f, &left_frags, &right_frags)) { delete child; } else { if (srcbox.left() + srcbox.right() <= chop_coord * 2) left_ch_it.add_after_then_move(child); else right_ch_it.add_after_then_move(child); } } } close_chopped_cfragments(&left_frags, &left_ch, pitch_error, left_it); close_chopped_cfragments(&right_frags, &right_ch, pitch_error, right_it); ASSERT_HOST(left_ch.empty() && right_ch.empty()); // No children left. delete srcline; // Smashed up. } else { // Chop failed. Just use middle coord. if (srcbox.left() + srcbox.right() <= chop_coord * 2) left_it->add_after_then_move(srcline); // Stick whole in left. else right_it->add_before_stay_put(srcline); } } }
// Recursively counts the outlines nested inside `outline`.
//   outline   - prospective parent.
//   max_count - budget; counting stops early once it is exceeded.
// Scans every bucket overlapped by the parent's bounding box and counts each
// other outline for which (*child < *outline) holds. Grandchildren are counted
// by recursion with a reduced budget and, when that budget is positive, scaled
// by edges_children_per_grandchild.
// Returns the combined child + grandchild count, or max_count + 1 when one of
// the area/path-length heuristics decides the parent must be discarded.
inT32 OL_BUCKETS::count_children(C_OUTLINE *outline, inT32 max_count) {
  // Range of buckets covered by the parent's bounding box.
  TBOX olbox = outline->bounding_box();
  const inT16 xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
  const inT16 xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
  const inT16 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
  const inT16 ymax = (olbox.top() - bl.y()) / BUCKETSIZE;

  inT32 direct_kids = 0;          // children seen so far
  inT32 weighted_grandkids = 0;   // grandchildren seen so far
  inT32 parent_area = 0;          // |outer_area| of parent, filled lazily
  FLOAT32 max_parent_area = 0;    // box area * edges_boxarea, filled lazily
  BOOL8 parent_boxy = TRUE;       // could the parent still be a box?
  C_OUTLINE_IT bucket_it;         // search iterator

  for (inT16 by = ymin; by <= ymax; by++) {
    for (inT16 bx = xmin; bx <= xmax; bx++) {
      bucket_it.set_to_list(&buckets[by * bxdim + bx]);
      if (bucket_it.empty()) continue;

      for (bucket_it.mark_cycle_pt(); !bucket_it.cycled_list();
           bucket_it.forward()) {
        C_OUTLINE *child = bucket_it.data();
        // Only outlines other than the parent that compare below it count.
        if (child == outline || !(*child < *outline)) continue;

        direct_kids++;
        if (direct_kids <= max_count) {
          int max_grand =
              (max_count - direct_kids) / edges_children_per_grandchild;
          if (max_grand > 0)
            weighted_grandkids += count_children(child, max_grand) *
                                  edges_children_per_grandchild;
          else
            weighted_grandkids += count_children(child, 1);
        }
        if (direct_kids + weighted_grandkids > max_count) {
          if (edges_debug)
            tprintf("Discarding parent with child count=%d, gc=%d\n",
                    direct_kids, weighted_grandkids);
          return direct_kids + weighted_grandkids;
        }

        // Compute the parent's area figures once, on the first child found.
        if (parent_area == 0) {
          parent_area = outline->outer_area();
          if (parent_area < 0) parent_area = -parent_area;
          max_parent_area = outline->bounding_box().area() * edges_boxarea;
          if (parent_area < max_parent_area) parent_boxy = FALSE;
        }

        // The remaining checks apply only while the parent may be boxy and,
        // with edges_children_fix on, only to children taller than
        // edges_min_nonhole.
        if (!parent_boxy ||
            (edges_children_fix &&
             !(child->bounding_box().height() > edges_min_nonhole)))
          continue;

        inT32 child_area = child->outer_area();
        if (child_area < 0) child_area = -child_area;

        if (edges_children_fix) {
          if (parent_area - child_area < max_parent_area) {
            parent_boxy = FALSE;
            continue;
          }
          if (weighted_grandkids > 0) {
            if (edges_debug)
              tprintf("Discarding parent of area %d, child area=%d, max%g "
                      "with gc=%d\n",
                      parent_area, child_area, max_parent_area,
                      weighted_grandkids);
            return max_count + 1;
          }
          inT32 child_length = child->pathlength();
          if (child_length * child_length >
              child_area * edges_patharea_ratio) {
            if (edges_debug)
              tprintf("Discarding parent of area %d, child area=%d, max%g "
                      "with child length=%d\n",
                      parent_area, child_area, max_parent_area, child_length);
            return max_count + 1;
          }
        }

        if (child_area < child->bounding_box().area() * edges_childarea) {
          if (edges_debug)
            tprintf("Discarding parent of area %d, child area=%d, max%g "
                    "with child rect=%d\n",
                    parent_area, child_area, max_parent_area,
                    child->bounding_box().area());
          return max_count + 1;
        }
      }
    }
  }
  return direct_kids + weighted_grandkids;
}
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t dud_words; //number discarded int32_t ok_words; //number remaining int32_t word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; // was it chucked std::vector<int8_t> word_dud(ok_words); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? 
blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else { word_dud[word_index] = 0; } if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data(); // Current word. // Previously we threw away the entire word. // Now just aggressively throw all small blobs into the reject list, where // the classifier can decide whether they are actually needed. word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } }
bool Textord::clean_noise_from_row( //remove empties ROW* row //row to clean ) { bool testing_on; TBOX blob_box; //bounding box C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word int32_t blob_size; //biggest size int32_t trans_count = 0; //no of transitions int32_t trans_threshold; //noise tolerance int32_t dot_count; //small objects int32_t norm_count; //normal objects int32_t super_norm_count; //real char-like //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator testing_on = textord_test_y > row->base_line (textord_test_x) && textord_show_blobs && textord_test_y < row->base_line (textord_test_x) + row->x_height (); dot_count = 0; norm_count = 0; super_norm_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) super_norm_count++; //count smal outlines } } else super_norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? 
blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; if (testing_on) { tprintf ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left (), blob_box.bottom (), blob_box.right (), blob_box.top (), blob->out_list ()->length (), trans_count, blob_box.bottom () - row->base_line (blob_box.left ())); } } } if (textord_noise_debug) { tprintf ("Row ending at (%d,%g):", blob_box.right (), row->base_line (blob_box.right ())); tprintf (" R=%g, dc=%d, nc=%d, %s\n", norm_count > 0 ? (float) dot_count / norm_count : 9999, dot_count, norm_count, dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED" : "ACCEPTED"); } return super_norm_count < textord_noise_sncount && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; }
void Textord::clean_noise_from_words( //remove empties ROW *row //row to clean ) { TBOX blob_box; //bounding box inT8 *word_dud; //was it chucked C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word inT32 blob_size; //biggest size inT32 trans_count; //no of transitions inT32 trans_threshold; //noise tolerance inT32 dot_count; //small objects inT32 norm_count; //normal objects inT32 dud_words; //number discarded inT32 ok_words; //number remaining inT32 word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0 || textord_no_rejects) return; word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8)); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. 
height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else word_dud[word_index] = 0; if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { word = word_it.data (); //current word //rejected blobs blob_it.set_to_list (word->rej_cblob_list ()); //move from blobs blob_it.add_list_after (word->cblob_list ()); } word_index++; } free_mem(word_dud); }