inT32 row_words2( //compute space size TO_BLOCK *block, //block it came from TO_ROW *row, //row to operate on inT32 maxwidth, //max expected space size FCOORD rotation, //for drawing BOOL8 testing_on //for debug ) { BOOL8 testing_row; //contains testpt BOOL8 prev_valid; //if decent size BOOL8 this_valid; //current blob big enough inT32 prev_x; //end of prev blob inT32 min_width; //min interesting width inT32 valid_count; //good gaps inT32 total_count; //total gaps inT32 cluster_count; //no of clusters inT32 prev_count; //previous cluster_count inT32 gap_index; //which cluster inT32 smooth_factor; //for smoothing stats BLOBNBOX *blob; //current blob float lower, upper; //clustering parameters ICOORD testpt; TBOX blob_box; //bounding box //iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS gap_stats (0, maxwidth); //gap sizes float gaps[BLOCK_STATS_CLUSTERS]; STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; //clusters testpt = ICOORD (textord_test_x, textord_test_y); smooth_factor = (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5); // if (testing_on) // tprintf("Row smooth factor=%d\n",smooth_factor); prev_valid = FALSE; prev_x = -MAX_INT16; testing_row = FALSE; //min blob size min_width = (inT32) block->pr_space; total_count = 0; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = blob->bounding_box (); this_valid = blob_box.width () >= min_width; this_valid = TRUE; if (this_valid && prev_valid && blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } total_count++; //count possibles prev_x = blob_box.right (); prev_valid = this_valid; } } valid_count = gap_stats.get_total (); if (valid_count < total_count * textord_words_minlarge) { gap_stats.clear (); prev_x = -MAX_INT16; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = blob->bounding_box (); if (blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } prev_x = blob_box.right (); } } } if (gap_stats.get_total () == 0) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } cluster_count = 0; lower = block->xheight * words_initial_lower; upper = block->xheight * words_initial_upper; gap_stats.smooth (smooth_factor); do { prev_count = cluster_count; cluster_count = gap_stats.cluster (lower, upper, textord_spacesize_ratioprop, BLOCK_STATS_CLUSTERS, cluster_stats); } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); if (cluster_count < 1) { row->min_space = 0; row->max_nonspace = 0; return 0; } for (gap_index = 0; gap_index < cluster_count; gap_index++) gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); //get medians if (testing_on) { tprintf ("cluster_count=%d:", cluster_count); for (gap_index = 0; gap_index < cluster_count; gap_index++) tprintf (" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total ()); tprintf ("\n"); } //Try to find proportional non-space and space for row. for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; gap_index++); if (gap_index < cluster_count) lower = gaps[gap_index]; //most frequent below else { if (testing_on) tprintf ("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); lower = block->pr_nonsp; } for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; gap_index++); if (gap_index < cluster_count) upper = gaps[gap_index]; //most frequent above else { if (testing_on) tprintf ("No cluster above block threshold!, using default=%g\n", block->pr_space); upper = block->pr_space; } row->min_space = (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread); row->max_nonspace = (inT32) floor (lower + (upper - lower) * textord_words_definite_spread); row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = upper; row->kern_size = lower; if (testing_on) { if (testing_row) { tprintf ("GAP STATS\n"); gap_stats.print (stdout, TRUE); tprintf ("SPACE stats\n"); cluster_stats[2].print (stdout, FALSE); tprintf ("NONSPACE stats\n"); cluster_stats[1].print (stdout, FALSE); } tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept (), row->min_space, upper, row->max_nonspace, lower); } return 1; }
#include "mfcpch.h" #include <string.h> #ifdef __UNIX__ #include <assert.h> #endif #include "coutln.h" #include <allheaders.h> // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif ELISTIZE (C_OUTLINE) ICOORD C_OUTLINE::step_coords[4] = { ICOORD (-1, 0), ICOORD (0, -1), ICOORD (1, 0), ICOORD (0, 1) }; /********************************************************************** * C_OUTLINE::C_OUTLINE * * Constructor to build a C_OUTLINE from a CRACKEDGE LOOP. **********************************************************************/ C_OUTLINE::C_OUTLINE ( //constructor CRACKEDGE * startpt, //outline to convert ICOORD bot_left, //bounding box ICOORD top_right, inT16 length //length of loop ):box (bot_left, top_right), start (startpt->pos) { inT16 stepindex; //index to step
inT32 row_words( //compute space size TO_BLOCK *block, //block it came from TO_ROW *row, //row to operate on inT32 maxwidth, //max expected space size FCOORD rotation, //for drawing BOOL8 testing_on //for debug ) { BOOL8 testing_row; //contains testpt BOOL8 prev_valid; //if decent size BOOL8 this_valid; //current blob big enough inT32 prev_x; //end of prev blob inT32 min_gap; //min interesting gap inT32 cluster_count; //no of clusters inT32 gap_index; //which cluster inT32 smooth_factor; //for smoothing stats BLOBNBOX *blob; //current blob float lower, upper; //clustering parameters float gaps[3]; //gap clusers ICOORD testpt; TBOX blob_box; //bounding box //iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS gap_stats (0, maxwidth); STATS cluster_stats[4]; //clusters testpt = ICOORD (textord_test_x, textord_test_y); smooth_factor = (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5); // if (testing_on) // tprintf("Row smooth factor=%d\n",smooth_factor); prev_valid = FALSE; prev_x = -MAX_INT32; testing_row = FALSE; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); blob_box = blob->bounding_box (); if (blob_box.contains (testpt)) testing_row = TRUE; gap_stats.add (blob_box.width (), 1); } min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile)); gap_stats.clear (); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = blob->bounding_box (); // this_valid=blob_box.width()>=min_gap; this_valid = TRUE; if (this_valid && prev_valid && blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } prev_x = blob_box.right (); prev_valid = this_valid; } } if (gap_stats.get_total () == 0) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } gap_stats.smooth (smooth_factor); lower = row->xheight * textord_words_initial_lower; upper = row->xheight * textord_words_initial_upper; cluster_count = gap_stats.cluster (lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); while (cluster_count < 2 && ceil (lower) < floor (upper)) { //shrink gap upper = (upper * 3 + lower) / 4; lower = (lower * 3 + upper) / 4; cluster_count = gap_stats.cluster (lower, upper, textord_spacesize_ratioprop, 3, cluster_stats); } if (cluster_count < 2) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } for (gap_index = 0; gap_index < cluster_count; gap_index++) gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); //get medians if (cluster_count > 2) { if (testing_on && textord_show_initial_words) { tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept (), cluster_stats[1].ile (0.5), cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5)); } lower = gaps[0]; if (gaps[1] > lower) { upper = gaps[1]; //prefer most frequent if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) { upper = gaps[2]; } } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) upper = gaps[2]; else if (lower >= block->xheight * textord_words_min_minspace) { upper = lower; //not nice lower = gaps[1]; if (testing_on && textord_show_initial_words) { tprintf ("Had to switch most common from lower to upper!!\n"); gap_stats.print (stdout, TRUE); } } else { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } } else { if (gaps[1] < gaps[0]) { if (testing_on && textord_show_initial_words) { tprintf ("Had to switch most common from lower to upper!!\n"); gap_stats.print (stdout, TRUE); } lower = gaps[1]; upper = gaps[0]; } else { upper = gaps[1]; lower = gaps[0]; } } if (upper < block->xheight * textord_words_min_minspace) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } if (upper * 3 < block->min_space * 2 + block->max_nonspace || lower * 3 > block->min_space * 2 + block->max_nonspace) { if (testing_on && textord_show_initial_words) { tprintf ("Disagreement between block and row at %g!!\n", row->intercept ()); tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper); gap_stats.print (stdout, TRUE); } } row->min_space = (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread); row->max_nonspace = (inT32) floor (lower + (upper - lower) * textord_words_definite_spread); row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = upper; row->kern_size = lower; if (testing_on && textord_show_initial_words) { if (testing_row) { tprintf ("GAP STATS\n"); gap_stats.print (stdout, TRUE); tprintf ("SPACE stats\n"); cluster_stats[2].print (stdout, FALSE); tprintf ("NONSPACE stats\n"); cluster_stats[1].print (stdout, FALSE); } tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept (), row->min_space, upper, row->max_nonspace, lower); } return cluster_stats[2].get_total (); }
void BLOBNBOX::chop( //chop blobs BLOBNBOX_IT *start_it, //location of this BLOBNBOX_IT *end_it, //iterator FCOORD rotation, //for landscape float xheight //of line ) { inT16 blobcount; //no of blobs BLOBNBOX *newblob; //fake blob BLOBNBOX *blob; //current blob inT16 blobindex; //number of chop inT16 leftx; //left edge of blob float blobwidth; //width of each float rightx; //right edge to scan float ymin, ymax; //limits of new blob float test_ymin, test_ymax; //limits of part blob ICOORD bl, tr; //corners of box BLOBNBOX_IT blob_it; //blob iterator //get no of chops blobcount = (inT16) floor (box.width () / xheight); if (blobcount > 1 && (blob_ptr != NULL || cblob_ptr != NULL)) { //width of each blobwidth = (float) (box.width () + 1) / blobcount; for (blobindex = blobcount - 1, rightx = box.right (); blobindex >= 0; blobindex--, rightx -= blobwidth) { ymin = (float) MAX_INT32; ymax = (float) -MAX_INT32; blob_it = *start_it; do { blob = blob_it.data (); if (blob->blob_ptr != NULL) find_blob_limits (blob->blob_ptr, rightx - blobwidth, rightx, rotation, test_ymin, test_ymax); else find_cblob_vlimits (blob->cblob_ptr, rightx - blobwidth, rightx, /*rotation, */ test_ymin, test_ymax); blob_it.forward (); if (test_ymin < ymin) ymin = test_ymin; if (test_ymax > ymax) ymax = test_ymax; } while (blob != end_it->data ()); if (ymin < ymax) { leftx = (inT16) floor (rightx - blobwidth); if (leftx < box.left ()) leftx = box.left (); //clip to real box bl = ICOORD (leftx, (inT16) floor (ymin)); tr = ICOORD ((inT16) ceil (rightx), (inT16) ceil (ymax)); if (blobindex == 0) box = TBOX (bl, tr); //change box else { newblob = new BLOBNBOX; //box is all it has newblob->box = TBOX (bl, tr); //stay on current end_it->add_after_stay_put (newblob); } } } } }