// Inserts all the blobs from the given list, with x and y spreading, // without removing from the source list, so ownership remains with the // source list. void BlobGrid::InsertBlobList(BLOBNBOX_LIST * blobs) { BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX *blob = blob_it.data(); if (!blob->joined_to_prev()) InsertBBox(true, true, blob); } }
//yangjing01 modified : bool TAL_make_single_word(bool one_blob, TO_ROW_LIST* rows, ROW_LIST* real_rows) { TO_ROW_IT to_row_it(rows); ROW_IT row_it(real_rows); //to_real_row is the real row information of single row or single char mode TO_ROW* real_to_row = NULL; float row_max_height = 0.0; for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()){ TO_ROW* row = to_row_it.data(); float row_min_y = row->min_y(); float row_max_y = row->max_y(); float row_height = abs(row_max_y - row_min_y); if (real_to_row == NULL || row_height > row_max_height || fabs(row_height - row_max_height) < 1.0f){ row_max_height = row_height; real_to_row = row; } } if (real_to_row == NULL){ return false; } C_BLOB_LIST cblobs; C_BLOB_IT cblob_it(&cblobs); BLOBNBOX_IT box_it(real_to_row->blob_list()); for (; !box_it.empty(); box_it.forward()){ BLOBNBOX* bblob = box_it.extract(); if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) { if (bblob->cblob() != NULL){ C_OUTLINE_IT cout_it(cblob_it.data()->out_list()); cout_it.move_to_last(); cout_it.add_list_after(bblob->cblob()->out_list()); delete bblob->cblob(); } } else { if (bblob->cblob() != NULL) cblob_it.add_after_then_move(bblob->cblob()); } delete bblob; } // Convert the TO_ROW to a ROW. ROW* real_row = new ROW(real_to_row, static_cast<inT16>(real_to_row->kern_size), static_cast<inT16>(real_to_row->space_size)); WERD_IT word_it(real_row->word_list()); WERD* word = new WERD(&cblobs, 0, NULL); word->set_flag(W_BOL, TRUE); word->set_flag(W_EOL, TRUE); word->set_flag(W_DONT_CHOP, one_blob); word_it.add_after_then_move(word); row_it.add_after_then_move(real_row); return true; }
// Builds a WERD from the next blobcount entries of box_it. Each BLOBNBOX is
// extracted from its list and deleted; its PBLOB or C_BLOB is moved into the
// new word. A blob flagged joined_to_prev has its outlines appended to the
// most recently added blob of the same kind instead of starting a new one.
// The word gets at least one leading blank, W_BOL when bol is set, and W_EOL
// when the iterator is at the first element once the blobs have been
// consumed.
WERD *make_real_word(BLOBNBOX_IT *box_it,  // iterator
                     inT32 blobcount,      // no of blobs to use
                     BOOL8 bol,            // start of line
                     uinT8 blanks          // no of blanks
                    ) {
  PBLOB_LIST poly_blobs;                // polygonal blobs in word
  C_BLOB_LIST chain_blobs;              // chain-coded blobs in word
  PBLOB_IT poly_it(&poly_blobs);
  C_BLOB_IT chain_it(&chain_blobs);
  OUTLINE_IT poly_outline_it;           // outlines of the blob merged into
  C_OUTLINE_IT chain_outline_it;

  for (inT32 remaining = blobcount; remaining > 0; --remaining) {
    BLOBNBOX *box = box_it->extract();
    const bool merge_with_previous = box->joined_to_prev();

    if (box->blob() != NULL) {
      if (merge_with_previous) {
        // Append the outlines to the previous PBLOB and drop the empty shell.
        poly_outline_it.set_to_list(poly_it.data()->out_list());
        poly_outline_it.move_to_last();
        poly_outline_it.add_list_after(box->blob()->out_list());
        delete box->blob();
      } else {
        poly_it.add_after_then_move(box->blob());
      }
    } else if (box->cblob() != NULL) {
      if (merge_with_previous) {
        // Same merge for the C_BLOB representation.
        chain_outline_it.set_to_list(chain_it.data()->out_list());
        chain_outline_it.move_to_last();
        chain_outline_it.add_list_after(box->cblob()->out_list());
        delete box->cblob();
      } else {
        chain_it.add_after_then_move(box->cblob());
      }
    }

    delete box;
    box_it->forward();                  // next one
  }

  if (blanks < 1)
    blanks = 1;

  // The C_BLOB list is used only when no PBLOBs were collected.
  WERD *word;
  if (!poly_it.empty())
    word = new WERD(&poly_blobs, blanks, NULL);
  else
    word = new WERD(&chain_blobs, blanks, NULL);

  if (bol)
    word->set_flag(W_BOL, TRUE);
  if (box_it->at_first())
    word->set_flag(W_EOL, TRUE);        // at end of line
  return word;
}
// Returns the bounding box of the blob the iterator currently points at,
// then advances the iterator past every following blob that is flagged
// joined_to_prev, leaving it on the next blob that is not.
TBOX box_next_pre_chopped(BLOBNBOX_IT *it  // iterator to blobs
                         ) {
  TBOX current_box = it->data ()->bounding_box ();

  // Always step at least once, then keep stepping over joined blobs.
  it->forward ();
  while (it->data ()->joined_to_prev ())
    it->forward ();

  return current_box;
}
// Returns the bounding box of the blob under the iterator, enlarged by the
// boxes of any following blobs that have no cblob (pre-chopped pieces).
// On return the iterator rests on the next blob that has a cblob and is not
// flagged joined_to_prev.
TBOX box_next(BLOBNBOX_IT *it  // iterator to blobs
             ) {
  TBOX merged_box = it->data ()->bounding_box ();

  for (;;) {
    it->forward ();
    BLOBNBOX *next_blob = it->data ();
    if (next_blob->cblob() == NULL) {
      // was pre-chopped: fold it into the result and keep going
      merged_box += next_blob->bounding_box ();
      continue;
    }
    if (!next_blob->joined_to_prev())
      break;                    // reached the next real blob
  }
  return merged_box;
}
// Builds a WERD from the next blobcount entries of box_it. Each BLOBNBOX is
// extracted from its list and deleted; its C_BLOB is moved into the new
// word. A blob flagged joined_to_prev has its outlines appended to the most
// recently added C_BLOB instead of starting a new one. The word gets at
// least one leading blank, W_BOL when bol is set, and W_EOL when the
// iterator is at the first element once the blobs have been consumed.
WERD *make_real_word(BLOBNBOX_IT *box_it,  // iterator
                     int32_t blobcount,    // no of blobs to use
                     bool bol,             // start of line
                     uint8_t blanks        // no of blanks
                    ) {
  C_BLOB_LIST word_blobs;
  C_BLOB_IT word_blob_it(&word_blobs);
  C_OUTLINE_IT outline_it;

  for (int32_t remaining = blobcount; remaining > 0; --remaining) {
    BLOBNBOX *box = box_it->extract();
    const bool merge_with_previous = box->joined_to_prev();

    if (box->cblob() != nullptr) {
      if (merge_with_previous) {
        // Append the outlines to the previous C_BLOB and drop the shell.
        outline_it.set_to_list(word_blob_it.data()->out_list());
        outline_it.move_to_last();
        outline_it.add_list_after(box->cblob()->out_list());
        delete box->cblob();
      } else {
        word_blob_it.add_after_then_move(box->cblob());
      }
    }

    delete box;
    box_it->forward();                  // next one
  }

  if (blanks < 1)
    blanks = 1;

  WERD *result = new WERD(&word_blobs, blanks, nullptr);
  if (bol)
    result->set_flag(W_BOL, true);
  if (box_it->at_first())
    result->set_flag(W_EOL, true);      // at end of line
  return result;
}
int32_t row_words2( //compute space size TO_BLOCK* block, //block it came from TO_ROW* row, //row to operate on int32_t maxwidth, //max expected space size FCOORD rotation, //for drawing bool testing_on //for debug ) { bool prev_valid; //if decent size bool this_valid; //current blob big enough int32_t prev_x; //end of prev blob int32_t min_width; //min interesting width int32_t valid_count; //good gaps int32_t total_count; //total gaps int32_t cluster_count; //no of clusters int32_t prev_count; //previous cluster_count int32_t gap_index; //which cluster int32_t smooth_factor; //for smoothing stats BLOBNBOX *blob; //current blob float lower, upper; //clustering parameters ICOORD testpt; TBOX blob_box; //bounding box //iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS gap_stats (0, maxwidth); //gap sizes float gaps[BLOCK_STATS_CLUSTERS]; STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; //clusters testpt = ICOORD (textord_test_x, textord_test_y); smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5); // if (testing_on) // tprintf("Row smooth factor=%d\n",smooth_factor); prev_valid = false; prev_x = -INT16_MAX; const bool testing_row = false; //min blob size min_width = static_cast<int32_t>(block->pr_space); total_count = 0; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = blob->bounding_box (); this_valid = blob_box.width () >= min_width; if (this_valid && prev_valid && blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } total_count++; //count possibles prev_x = blob_box.right (); prev_valid = this_valid; } } valid_count = gap_stats.get_total (); if (valid_count < total_count * textord_words_minlarge) { gap_stats.clear (); prev_x = -INT16_MAX; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = 
blob->bounding_box (); if (blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } prev_x = blob_box.right (); } } } if (gap_stats.get_total () == 0) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } cluster_count = 0; lower = block->xheight * words_initial_lower; upper = block->xheight * words_initial_upper; gap_stats.smooth (smooth_factor); do { prev_count = cluster_count; cluster_count = gap_stats.cluster (lower, upper, textord_spacesize_ratioprop, BLOCK_STATS_CLUSTERS, cluster_stats); } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); if (cluster_count < 1) { row->min_space = 0; row->max_nonspace = 0; return 0; } for (gap_index = 0; gap_index < cluster_count; gap_index++) gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); //get medians if (testing_on) { tprintf ("cluster_count=%d:", cluster_count); for (gap_index = 0; gap_index < cluster_count; gap_index++) tprintf (" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total ()); tprintf ("\n"); } //Try to find proportional non-space and space for row. 
for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; gap_index++); if (gap_index < cluster_count) lower = gaps[gap_index]; //most frequent below else { if (testing_on) tprintf ("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); lower = block->pr_nonsp; } for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; gap_index++); if (gap_index < cluster_count) upper = gaps[gap_index]; //most frequent above else { if (testing_on) tprintf ("No cluster above block threshold!, using default=%g\n", block->pr_space); upper = block->pr_space; } row->min_space = static_cast<int32_t>(ceil (upper - (upper - lower) * textord_words_definite_spread)); row->max_nonspace = static_cast<int32_t>(floor (lower + (upper - lower) * textord_words_definite_spread)); row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = upper; row->kern_size = lower; if (testing_on) { if (testing_row) { tprintf ("GAP STATS\n"); gap_stats.print(); tprintf ("SPACE stats\n"); cluster_stats[2].print_summary(); tprintf ("NONSPACE stats\n"); cluster_stats[1].print_summary(); } tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept (), row->min_space, upper, row->max_nonspace, lower); } return 1; }
// Derives the per-row spacing thresholds (min_space, max_nonspace,
// space_threshold, space_size, kern_size) from a histogram of the horizontal
// gaps between the row's blobs, clustered into at most 3 groups.
// Returns the sample count of cluster 2 on success. Returns 0, with
// min_space and max_nonspace zeroed, when there are no gaps, fewer than two
// clusters can be found, or the chosen space size is below
// block->xheight * textord_words_min_minspace.
// The rotation argument is not used by the body.
int32_t row_words(                       //compute space size
                  TO_BLOCK* block,       //block it came from
                  TO_ROW* row,           //row to operate on
                  int32_t maxwidth,      //max expected space size
                  FCOORD rotation,       //for drawing
                  bool testing_on        //for debug
                 ) {
  bool testing_row;              //contains testpt
  bool prev_valid;               //true once a previous blob edge is recorded
  int32_t prev_x;                //end of prev blob
  int32_t cluster_count;         //no of clusters
  int32_t gap_index;             //which cluster
  int32_t smooth_factor;         //for smoothing stats
  BLOBNBOX *blob;                //current blob
  float lower, upper;            //clustering parameters, later kern/space size
  float gaps[3];                 //gap clusters (median of each)
  ICOORD testpt;
  TBOX blob_box;                 //bounding box
                                 //iterator
  BLOBNBOX_IT blob_it = row->blob_list ();
  STATS gap_stats (0, maxwidth);
  STATS cluster_stats[4];        //clusters; index 0 is not read here

  testpt = ICOORD (textord_test_x, textord_test_y);
  smooth_factor =
    static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
  //      if (testing_on)
  //              tprintf("Row smooth factor=%d\n",smooth_factor);
  prev_valid = false;
  prev_x = -INT32_MAX;
  testing_row = false;
  // First sweep: flag the row as the debug row if any blob contains testpt.
  // The widths added to gap_stats here are thrown away by the clear() that
  // follows the loop.
  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
    blob = blob_it.data ();
    blob_box = blob->bounding_box ();
    if (blob_box.contains (testpt))
      testing_row = true;
    gap_stats.add (blob_box.width (), 1);
  }
  gap_stats.clear ();
  // Second sweep: histogram the gap between each unjoined blob and the
  // previous one, ignoring gaps of maxwidth or more.
  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
    blob = blob_it.data ();
    if (!blob->joined_to_prev ()) {
      blob_box = blob->bounding_box ();
      // prev_valid short-circuits the subtraction for the first blob, so the
      // -INT32_MAX sentinel is never used in arithmetic.
      if (prev_valid && blob_box.left () - prev_x < maxwidth) {
        gap_stats.add (blob_box.left () - prev_x, 1);
      }
      prev_valid = true;
      prev_x = blob_box.right ();
    }
  }
  if (gap_stats.get_total () == 0) {
    row->min_space = 0;          //no evidence
    row->max_nonspace = 0;
    return 0;
  }
  gap_stats.smooth (smooth_factor);
  // Note: the initial bounds are scaled by row->xheight, whereas the later
  // minimum-space checks use block->xheight.
  lower = row->xheight * textord_words_initial_lower;
  upper = row->xheight * textord_words_initial_upper;
  cluster_count = gap_stats.cluster (lower, upper,
                                     textord_spacesize_ratioprop, 3,
                                     cluster_stats);
  while (cluster_count < 2 && ceil (lower) < floor (upper)) {
    //shrink gap
    // Order matters: lower is computed from the already-updated upper.
    upper = (upper * 3 + lower) / 4;
    lower = (lower * 3 + upper) / 4;
    cluster_count = gap_stats.cluster (lower, upper,
                                       textord_spacesize_ratioprop, 3,
                                       cluster_stats);
  }
  if (cluster_count < 2) {
    row->min_space = 0;          //no evidence
    row->max_nonspace = 0;
    return 0;
  }
  for (gap_index = 0; gap_index < cluster_count; gap_index++)
    gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
  //get medians
  if (cluster_count > 2) {
    // Three clusters: gaps[0] is the provisional non-space size; choose the
    // space size from gaps[1] / gaps[2].
    if (testing_on && textord_show_initial_words) {
      tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
               row->intercept (),
               cluster_stats[1].ile (0.5),
               cluster_stats[2].ile (0.5),
               cluster_stats[3].ile (0.5));
    }
    lower = gaps[0];
    if (gaps[1] > lower) {
      upper = gaps[1];           //prefer most frequent
      // Fall through to the third cluster when gaps[1] is too small to be a
      // space and gaps[2] is larger.
      if (upper < block->xheight * textord_words_min_minspace
          && gaps[2] > gaps[1]) {
        upper = gaps[2];
      }
    }
    else if (gaps[2] > lower
             && gaps[2] >= block->xheight * textord_words_min_minspace)
      upper = gaps[2];
    else if (lower >= block->xheight * textord_words_min_minspace) {
      // gaps[0] is itself large enough to be the space size; swap roles.
      upper = lower;             //not nice
      lower = gaps[1];
      if (testing_on && textord_show_initial_words) {
        tprintf ("Had to switch most common from lower to upper!!\n");
        gap_stats.print();
      }
    }
    else {
      row->min_space = 0;        //no evidence
      row->max_nonspace = 0;
      return 0;
    }
  }
  else {
    // Exactly two clusters: the smaller median is the non-space size, the
    // larger the space size.
    if (gaps[1] < gaps[0]) {
      if (testing_on && textord_show_initial_words) {
        tprintf ("Had to switch most common from lower to upper!!\n");
        gap_stats.print();
      }
      lower = gaps[1];
      upper = gaps[0];
    }
    else {
      upper = gaps[1];
      lower = gaps[0];
    }
  }
  if (upper < block->xheight * textord_words_min_minspace) {
    row->min_space = 0;          //no evidence
    row->max_nonspace = 0;
    return 0;
  }
  // Diagnostic only: report when the row estimate straddles the block
  // estimate badly; no values are changed here.
  if (upper * 3 < block->min_space * 2 + block->max_nonspace
      || lower * 3 > block->min_space * 2 + block->max_nonspace) {
    if (testing_on && textord_show_initial_words) {
      tprintf ("Disagreement between block and row at %g!!\n",
               row->intercept ());
      tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
      gap_stats.print();
    }
  }
  // min_space and max_nonspace are each pulled in from upper / lower by the
  // fraction textord_words_definite_spread of the distance between them.
  row->min_space =
    static_cast<int32_t>(ceil (upper - (upper - lower) * textord_words_definite_spread));
  row->max_nonspace =
    static_cast<int32_t>(floor (lower + (upper - lower) * textord_words_definite_spread));
  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
  row->space_size = upper;
  row->kern_size = lower;
  if (testing_on && textord_show_initial_words) {
    if (testing_row) {
      tprintf ("GAP STATS\n");
      gap_stats.print();
      tprintf ("SPACE stats\n");
      cluster_stats[2].print_summary();
      tprintf ("NONSPACE stats\n");
      cluster_stats[1].print_summary();
    }
    tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
             row->intercept (), row->min_space, upper,
             row->max_nonspace, lower);
  }
  return cluster_stats[2].get_total ();
}
void plot_word_decisions( //draw words ScrollView *win, //window tro draw in inT16 pitch, //of block TO_ROW *row //row to draw ) { ScrollView::Color colour = ScrollView::MAGENTA; //current colour ScrollView::Color rect_colour; //fuzzy colour inT32 prev_x; //end of prev blob inT16 blob_count; //blobs in word BLOBNBOX *blob; //current blob TBOX blob_box; //bounding box //iterator BLOBNBOX_IT blob_it = row->blob_list(); BLOBNBOX_IT start_it = blob_it;//word start rect_colour = ScrollView::BLACK; prev_x = -MAX_INT16; blob_count = 0; for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { blob = blob_it.data(); blob_box = blob->bounding_box(); if (!blob->joined_to_prev() && blob_box.left() - prev_x > row->max_nonspace) { if ((blob_box.left() - prev_x >= row->min_space || blob_box.left() - prev_x > row->space_threshold) && blob_count > 0) { if (pitch > 0 && textord_show_fixed_cuts) plot_fp_cells(win, colour, &start_it, pitch, blob_count, &row->projection, row->projection_left, row->projection_right, row->xheight * textord_projection_scale); blob_count = 0; start_it = blob_it; } if (colour == ScrollView::MAGENTA) colour = ScrollView::RED; else colour = (ScrollView::Color)(colour + 1); if (blob_box.left() - prev_x < row->min_space) { if (blob_box.left() - prev_x > row->space_threshold) rect_colour = ScrollView::GOLDENROD; else rect_colour = ScrollView::CORAL; //fill_color_index(win, rect_colour); win->Brush(rect_colour); win->Rectangle(prev_x, blob_box.bottom(), blob_box.left(), blob_box.top()); } } if (!blob->joined_to_prev()) prev_x = blob_box.right(); if (blob->cblob() != NULL) blob->cblob()->plot(win, colour, colour); if (!blob->joined_to_prev() && blob->cblob() != NULL) blob_count++; } if (pitch > 0 && textord_show_fixed_cuts && blob_count > 0) plot_fp_cells(win, colour, &start_it, pitch, blob_count, &row->projection, row->projection_left, row->projection_right, row->xheight * textord_projection_scale); }