/**
 * fp_eval_word_spacing()
 *
 * Scores a word list. When fixsp_fp_eval is off the work is delegated to
 * eval_word_spacing(). Otherwise only words that are done, tess accepted,
 * chosen by one of the system/frequent/user dawg permuters, or for which
 * safe_dict_word() is positive contribute to the score:
 *   - each character that is a space, or whose blob has a noise score below
 *     bln_x_height * fixsp_small_outlines_size, subtracts one;
 *   - each other character that is accepted in the reject map adds one.
 *
 * @param word_res_list  words to evaluate
 * @return the accumulated score, never less than zero
 */
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  const float small_limit = bln_x_height * fixsp_small_outlines_size;

  if (!fixsp_fp_eval)
    return eval_word_spacing(word_res_list);

  inT16 score = 0;
  PBLOB_IT blob_it;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *word = word_it.data();
    const inT16 word_length = word->reject_map.length();
    const char *chs = word->best_choice->string().string();

    // Skip words that are neither accepted nor dictionary-like.
    if (!word->done && !word->tess_accepted &&
        word->best_choice->permuter() != SYSTEM_DAWG_PERM &&
        word->best_choice->permuter() != FREQ_DAWG_PERM &&
        word->best_choice->permuter() != USER_DAWG_PERM &&
        safe_dict_word(chs) <= 0) {
      continue;
    }

    blob_it.set_to_list(word->outword->blob_list());
    inT16 offset = 0;  // index into chs of the current character
    for (inT16 i = 0; i < word_length; ++i) {
      if (chs[offset] == ' ' ||
          blob_noise_score(blob_it.data()) < small_limit) {
        score -= 1;  // penalise possibly erroneous non-space
      } else if (word->reject_map[i].accepted()) {
        score++;
      }
      // Step to the next character and its blob.
      offset += word->best_choice->lengths()[i];
      blob_it.forward();
    }
  }

  if (score < 0)
    score = 0;
  return score;
}
/**
 * fp_eval_word_spacing()
 *
 * Scores a word list. Words without a rebuild_word are skipped. Of the rest,
 * only words that are done, tess accepted, chosen by one of the
 * system/frequent/user dawg permuters, or for which safe_dict_word() is
 * positive contribute to the score:
 *   - each character that is a space, or whose blob has a noise score below
 *     kBlnXHeight * fixsp_small_outlines_size, subtracts one;
 *   - each other character that is accepted in the reject map adds one.
 * Characters and blobs are walked in step; the walk stops at whichever of
 * the best choice or the blob chain ends first.
 *
 * @param word_res_list  words to evaluate
 * @return the accumulated score, never less than zero
 */
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_it(&word_res_list);
  inT16 score = 0;
  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;

  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD_RES *word = word_it.data();
    if (word->rebuild_word == NULL)
      continue;  // Can't handle cube words.
    const inT16 word_length = word->reject_map.length();
    if (word->done ||
        word->tess_accepted ||
        word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        safe_dict_word(word) > 0) {
      TBLOB* blob = word->rebuild_word->blobs;
      UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
      for (inT16 i = 0;
           i < word->best_choice->length() && blob != NULL;
           ++i, blob = blob->next) {
        if (word->best_choice->unichar_id(i) == space ||
            blob_noise_score(blob) < small_limit) {
          score -= 1;  // penalise possibly erroneous non-space
        } else if (i < word_length && word->reject_map[i].accepted()) {
          // The loop is bounded by the best choice and the blob chain, not by
          // the reject map, so only consult the map for indices it holds.
          score++;
        }
      }
    }
  }
  if (score < 0)
    score = 0;
  return score;
}
/**
 * worst_noise_blob()
 *
 * Finds the blob with the lowest noise score among the blobs that are far
 * enough from both ends of the word. Accepted blobs are given the score
 * kBlnXHeight * 0.8 (the non-noise limit); every other blob gets
 * blob_noise_score(). The candidate range starts after the first
 * fixsp_non_noise_limit non-noise blobs counted from the front and ends
 * before the last fixsp_non_noise_limit non-noise blobs counted from the
 * back. A blob is only reported if its score is below
 * kBlnXHeight * fixsp_small_outlines_size.
 *
 * @param word_res           word to inspect
 * @param worst_noise_score  receives the score of the returned blob, or the
 *                           small-outline limit if no blob in the candidate
 *                           range beats it. Left untouched on the early -1
 *                           returns (no rebuild_word, fewer than 5 blobs,
 *                           not enough non-noise blobs, empty range).
 * @return index of the worst blob, or -1 if there is none
 */
inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
                                  float *worst_noise_score) {
  const int kMaxNoiseBlobs = 512;  // capacity of the score buffer
  float noise_score[kMaxNoiseBlobs];
  int i;
  int min_noise_blob;            // 1st contender
  int max_noise_blob;            // last contender
  int non_noise_count;
  int worst_blob;                // Worst blob
  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == NULL)
    return -1;  // Can't handle cube words.

  TBLOB* blob = word_res->rebuild_word->blobs;
  // Normalised.
  int blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= kMaxNoiseBlobs);
  if (blob_count < 5)
    return -1;  // too short to split

  /* Get the noise scores for all blobs */

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
  #endif

  for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob);

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  // The blob chain may end before blob_count entries have been scored.
  // Restrict the scans below to the entries that were actually written so
  // that no uninitialised score is ever read.
  blob_count = i;

  /* Now find the worst one which is far enough away from the end of the word */

  non_noise_count = 0;
  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  // i is now one past the last leading non-noise blob that was counted.
  min_noise_blob = i;

  non_noise_count = 0;
  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
       i--) {
    if (noise_score[i] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;

  // i is now one before the last trailing non-noise blob that was counted.
  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  *worst_noise_score = small_limit;
  worst_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_blob;
}
/**
 * worst_noise_blob()
 *
 * Returns the index of the blob with the lowest noise score among the blobs
 * lying between the leading and trailing runs of non-noise blobs. Accepted
 * blobs are scored as bln_x_height * 0.8 (the non-noise limit), all others
 * by blob_noise_score(). Only a blob scoring below
 * bln_x_height * fixsp_small_outlines_size is reported.
 *
 * @param word_res           word to inspect
 * @param worst_noise_score  receives the score of the returned blob, or the
 *                           small-outline limit when no blob beats it; not
 *                           written on the early -1 returns
 * @return index of the worst blob, or -1 if there is none
 */
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  float noise_score[512];
  float small_limit = bln_x_height * fixsp_small_outlines_size;
  float non_noise_limit = bln_x_height * 0.8;

  PBLOB_IT blob_it;
  blob_it.set_to_list(word_res->outword->blob_list());  // normalised
  inT16 blob_count = blob_it.length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;  // too short to split

  /* Score every blob */

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf ("FP fixspace Noise metrics for \"%s\": ",
             word_res->best_choice->string ().string ());
  #endif

  int idx = 0;
  while (idx < blob_count) {
    if (word_res->reject_map[idx].accepted()) {
      noise_score[idx] = non_noise_limit;
    } else {
      noise_score[idx] = blob_noise_score(blob_it.data());
    }

    if (debug_fix_space_level > 5)
      tprintf ("%1.1f ", noise_score[idx]);

    ++idx;
    blob_it.forward();
  }
  if (debug_fix_space_level > 5)
    tprintf ("\n");

  /* Skip enough non-noise blobs at the front of the word */

  int seen = 0;
  int first_candidate = 0;
  while (first_candidate < blob_count && seen < fixsp_non_noise_limit) {
    if (noise_score[first_candidate] >= non_noise_limit)
      ++seen;
    ++first_candidate;
  }
  if (seen < fixsp_non_noise_limit)
    return -1;

  /* Skip enough non-noise blobs at the back of the word */

  seen = 0;
  int last_candidate = blob_count - 1;
  while (last_candidate >= 0 && seen < fixsp_non_noise_limit) {
    if (noise_score[last_candidate] >= non_noise_limit)
      ++seen;
    --last_candidate;
  }
  if (seen < fixsp_non_noise_limit)
    return -1;

  if (first_candidate > last_candidate)
    return -1;

  /* Pick the lowest score inside the remaining range */

  int worst_index = -1;
  *worst_noise_score = small_limit;
  for (idx = first_candidate; idx <= last_candidate; ++idx) {
    if (noise_score[idx] < *worst_noise_score) {
      *worst_noise_score = noise_score[idx];
      worst_index = idx;
    }
  }
  return worst_index;
}