/**********************************************************************
 * render_segmentation
 *
 * Draw every chunk in the list into the window, switching to the next
 * entry of color_list each time the supplied segmentation says a new
 * character starts, then zoom the window to the union of all chunk
 * bounding boxes.
 **********************************************************************/
void render_segmentation(ScrollView *window,
                         TBLOB *chunks,
                         SEARCH_STATE segmentation) {
  C_COL draw_color = Black;
  int char_index = -1;
  int remaining_in_char = 0;

  TBOX extent;
  if (chunks)
    extent = chunks->bounding_box();

  TBLOB *chunk = chunks;
  while (chunk != NULL) {
    extent += chunk->bounding_box();

    // Test the count before decrementing it (post-decrement semantics).
    const bool start_new_char = (remaining_in_char == 0);
    --remaining_in_char;
    if (start_new_char) {
      ++char_index;
      draw_color = color_list[char_index % NUM_COLORS];
      // segmentation[0] bounds the character index; beyond it every
      // remaining chunk keeps the current color.
      remaining_in_char = (char_index < segmentation[0])
                              ? segmentation[char_index + 1]
                              : MAX_INT32;
    }
    render_outline(window, chunk->outlines, draw_color);
    chunk = chunk->next;
  }
  window->ZoomToRectangle(extent.left(), extent.top(),
                          extent.right(), extent.bottom());
}
/**********************************************************************
 * blobs_widths
 *
 * Compute the widths of a list of blobs. Return an array of the widths
 * and gaps: widths[0] is the first blob's width, followed by a
 * (gap, width) pair for every subsequent blob. num_chars holds the
 * number of blobs.
 *
 * An empty list (blobs == NULL) yields a record with num_chars == 0
 * instead of dereferencing a null pointer.
 **********************************************************************/
WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  const int num_blobs = count_blobs(blobs);

  if (blobs == NULL || num_blobs <= 0) {
    // Nothing to measure: hand back a valid, empty record.
    WIDTH_RECORD *empty_record =
        (WIDTH_RECORD *) memalloc(sizeof(WIDTH_RECORD));
    empty_record->num_chars = 0;
    return empty_record;
  }

  /* Get memory: one int for num_chars plus 2 * num_blobs - 1 widths/gaps */
  WIDTH_RECORD *width_record =
      (WIDTH_RECORD *) memalloc(sizeof(int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

  int i = 0;
  TBOX bbox = blobs->bounding_box();
  width_record->widths[i++] = bbox.width();  /* First width */
  int blob_end = bbox.right();

  for (TBLOB *blob = blobs->next; blob != NULL; blob = blob->next) {
    TBOX curbox = blob->bounding_box();
    width_record->widths[i++] = curbox.left() - blob_end;  /* gap */
    width_record->widths[i++] = curbox.width();            /* width */
    blob_end = curbox.right();
  }
  return width_record;
}
// Plots every blob of the word in the given window. Each blob gets the next
// color in the WERD::NextColor sequence (starting after BLACK); BROWN is
// passed as the second color to each blob's plot call.
void TWERD::plot(ScrollView* window) {
  ScrollView::Color blob_color = WERD::NextColor(ScrollView::BLACK);
  TBLOB* current = blobs;
  while (current != NULL) {
    current->plot(window, blob_color, ScrollView::BROWN);
    blob_color = WERD::NextColor(blob_color);
    current = current->next;
  }
}
// Returns the number of misfit blob tops in this word. int Tesseract::CountMisfitTops(WERD_RES *word_res) { int bad_blobs = 0; TBLOB* blob = word_res->rebuild_word->blobs; int blob_id = 0; for (; blob != NULL; blob = blob->next, ++blob_id) { UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { int top = blob->bounding_box().top(); if (top >= INT_FEAT_RANGE) top = INT_FEAT_RANGE - 1; int min_bottom, max_bottom, min_top, max_top; unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); if (max_top - min_top > kMaxCharTopRange) continue; bool bad = top < min_top - x_ht_acceptance_tolerance || top > max_top + x_ht_acceptance_tolerance; if (bad) ++bad_blobs; if (debug_x_ht_level >= 1) { tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n", unicharset.id_to_unichar(class_id), bad ? "Misfit" : "OK", top, min_top, max_top, static_cast<int>(x_ht_acceptance_tolerance)); } } } return bad_blobs; }
/** * @name chop_one_blob * * Start with the current one-blob word and its classification. Find * the worst blobs and try to divide it up to improve the ratings. * Used for testing chopper. */ bool Wordrec::chop_one_blob(TWERD *word, BLOB_CHOICE_LIST_VECTOR *char_choices, inT32 *blob_number, SEAMS *seam_list, int *right_chop_index) { TBLOB *blob; inT16 x = 0; float rating_ceiling = MAX_FLOAT32; BLOB_CHOICE_LIST *answer; BLOB_CHOICE_IT answer_it; SEAM *seam; UNICHAR_ID unichar_id = 0; int left_chop_index = 0; do { *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false); if (chop_debug) cprintf("blob_number = %d\n", *blob_number); if (*blob_number == -1) return false; seam = attempt_blob_chop(word, *blob_number, true, *seam_list); if (seam != NULL) break; /* Must split null blobs */ answer = char_choices->get(*blob_number); if (answer == NULL) return false; answer_it.set_to_list(answer); rating_ceiling = answer_it.data()->rating(); // try a different blob } while (true); /* Split OK */ for (blob = word->blobs; x < *blob_number; x++) { blob = blob->next; } if (chop_debug) { tprintf("Chop made blob1:"); blob->bounding_box().print(); tprintf("and blob2:"); blob->next->bounding_box().print(); } *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs); answer = char_choices->get(*blob_number); answer_it.set_to_list(answer); unichar_id = answer_it.data()->unichar_id(); float rating = answer_it.data()->rating() / exp(1.0); left_chop_index = atoi(unicharset.id_to_unichar(unichar_id)); delete char_choices->get(*blob_number); // combine confidence w/ serial # answer = fake_classify_blob(0, rating, -rating); modify_blob_choice(answer, left_chop_index); char_choices->insert(answer, *blob_number); answer = fake_classify_blob(0, rating - 0.125f, -rating); modify_blob_choice(answer, ++*right_chop_index); char_choices->set(answer, *blob_number + 1); return true; }
// Returns the union of the bounding boxes of all blobs in the word
// (a default-constructed TBOX if the word has no blobs).
TBOX TWERD::bounding_box() const {
  TBOX word_box;
  TBLOB* current = blobs;
  while (current != NULL) {
    word_box += current->bounding_box();
    current = current->next;
  }
  return word_box;
}
/**********************************************************************
 * record_blob_bounds
 *
 * Allocate an array with one TBOX per blob in the list and fill it with
 * each blob's bounding box, in list order. Caller should delete[] the
 * array.
 **********************************************************************/
TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) {
  TBOX *bounds = new TBOX[count_blobs(blobs)];
  inT16 slot = 0;
  TBLOB *current = blobs;
  while (current != NULL) {
    bounds[slot++] = current->bounding_box();
    current = current->next;
  }
  return bounds;
}
// Note: After running this function word_res->ratings // might not contain the right BLOB_CHOICE corresponding to each character // in word_res->best_choice. void Tesseract::flip_hyphens(WERD_RES *word_res) { WERD_CHOICE *best_choice = word_res->best_choice; int i; int prev_right = -9999; int next_left; TBOX out_box; float aspect_ratio; if (tessedit_lower_flip_hyphen <= 1) return; int num_blobs = word_res->rebuild_word->NumBlobs(); UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-"); for (i = 0; i < best_choice->length() && i < num_blobs; ++i) { TBLOB* blob = word_res->rebuild_word->blobs[i]; out_box = blob->bounding_box(); if (i + 1 == num_blobs) next_left = 9999; else next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); // Dont touch small or touching blobs - it is too dangerous. if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && (out_box.right() < next_left)) { aspect_ratio = out_box.width() / (float) out_box.height(); if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) { if (aspect_ratio >= tessedit_upper_flip_hyphen && word_res->uch_set->contains_unichar_id(unichar_dash) && word_res->uch_set->get_enabled(unichar_dash)) { /* Certain HYPHEN */ best_choice->set_unichar_id(unichar_dash, i); if (word_res->reject_map[i].rejected()) word_res->reject_map[i].setrej_hyphen_accept(); } if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) //Suspected HYPHEN word_res->reject_map[i].setrej_hyphen (); } else if (best_choice->unichar_id(i) == unichar_dash) { if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) word_res->reject_map[i].setrej_hyphen_accept(); //Certain HYPHEN if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) //Suspected HYPHEN word_res->reject_map[i].setrej_hyphen(); } } prev_right = out_box.right(); } }
// Baseline normalizes the blobs in-place, recording the normalization in the // DENORMs in the blobs. void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse, float x_height, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX* norm_box, DENORM* word_denorm) { TBOX word_box = bounding_box(); if (norm_box != NULL) word_box = *norm_box; float word_middle = (word_box.left() + word_box.right()) / 2.0f; float input_y_offset = 0.0f; float final_y_offset = static_cast<float>(kBlnBaselineOffset); float scale = kBlnXHeight / x_height; if (hint == tesseract::OEM_CUBE_ONLY || row == NULL) { word_middle = word_box.left(); input_y_offset = word_box.bottom(); final_y_offset = 0.0f; if (hint == tesseract::OEM_CUBE_ONLY) scale = 1.0f; } else { input_y_offset = row->base_line(word_middle); } for (int b = 0; b < blobs.size(); ++b) { TBLOB* blob = blobs[b]; TBOX blob_box = blob->bounding_box(); float mid_x = (blob_box.left() + blob_box.right()) / 2.0f; float baseline = input_y_offset; float blob_scale = scale; if (numeric_mode) { baseline = blob_box.bottom(); blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), scale, scale * 1.5f); } else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) { baseline = row->base_line(mid_x); } // The image will be 8-bit grey if the input was grey or color. Note that in // a grey image 0 is black and 255 is white. If the input was binary, then // the pix will be binary and 0 is white, with 1 being black. // To tell the difference pixGetDepth() will return 8 or 1. // The inverse flag will be true iff the word has been determined to be // white on black, and is independent of whether the pix is 8 bit or 1 bit. 
blob->Normalize(block, NULL, NULL, word_middle, baseline, blob_scale, blob_scale, 0.0f, final_y_offset, inverse, pix); } if (word_denorm != NULL) { word_denorm->SetupNormalization(block, NULL, NULL, word_middle, input_y_offset, scale, scale, 0.0f, final_y_offset); word_denorm->set_inverse(inverse); word_denorm->set_pix(pix); } }
// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as // (x,y) position and angle as measured counterclockwise from the vector // <-1, 0>, from blob using two normalizations defined by bl_denorm and // cn_denorm. See SetpuBLCNDenorms for definitions. // If outline_cn_counts is not nullptr, on return it contains the cumulative // number of cn features generated for each outline in the blob (in order). // Thus after the first outline, there were (*outline_cn_counts)[0] features, // after the second outline, there were (*outline_cn_counts)[1] features etc. void Classify::ExtractFeatures(const TBLOB& blob, bool nonlinear_norm, GenericVector<INT_FEATURE_STRUCT>* bl_features, GenericVector<INT_FEATURE_STRUCT>* cn_features, INT_FX_RESULT_STRUCT* results, GenericVector<int>* outline_cn_counts) { DENORM bl_denorm, cn_denorm; tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm, &bl_denorm, &cn_denorm, results); if (outline_cn_counts != nullptr) outline_cn_counts->truncate(0); // Iterate the outlines. for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) { // Iterate the polygon. EDGEPT* loop_pt = ol->FindBestStartPt(); EDGEPT* pt = loop_pt; if (pt == nullptr) continue; do { if (pt->IsHidden()) continue; // Find a run of equal src_outline. EDGEPT* last_pt = pt; do { last_pt = last_pt->next; } while (last_pt != loop_pt && !last_pt->IsHidden() && last_pt->src_outline == pt->src_outline); last_pt = last_pt->prev; // Until the adaptive classifier can be weaned off polygon segments, // we have to force extraction from the polygon for the bl_features. 
ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength, true, bl_features); ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength, false, cn_features); pt = last_pt; } while ((pt = pt->next) != loop_pt); if (outline_cn_counts != nullptr) outline_cn_counts->push_back(cn_features->size()); } results->NumBL = bl_features->size(); results->NumCN = cn_features->size(); results->YBottom = blob.bounding_box().bottom(); results->YTop = blob.bounding_box().top(); results->Width = blob.bounding_box().width(); }
// Displays the segmentation state of *this (if not the same as the last // one displayed) and waits for a click in the window. void WERD_CHOICE::DisplaySegmentation(TWERD* word) { #ifndef GRAPHICS_DISABLED // Number of different colors to draw with. const int kNumColors = 6; static ScrollView *segm_window = NULL; // Check the state against the static prev_drawn_state. static GenericVector<int> prev_drawn_state; bool already_done = prev_drawn_state.size() == length_; if (!already_done) prev_drawn_state.init_to_size(length_, 0); for (int i = 0; i < length_; ++i) { if (prev_drawn_state[i] != state_[i]) { already_done = false; } prev_drawn_state[i] = state_[i]; } if (already_done || word->blobs.empty()) return; // Create the window if needed. if (segm_window == NULL) { segm_window = new ScrollView("Segmentation", 5, 10, 500, 256, 2000.0, 256.0, true); } else { segm_window->Clear(); } TBOX bbox; int blob_index = 0; for (int c = 0; c < length_; ++c) { ScrollView::Color color = static_cast<ScrollView::Color>(c % kNumColors + 3); for (int i = 0; i < state_[c]; ++i, ++blob_index) { TBLOB* blob = word->blobs[blob_index]; bbox += blob->bounding_box(); blob->plot(segm_window, color, color); } } segm_window->ZoomToRectangle(bbox.left(), bbox.top(), bbox.right(), bbox.bottom()); segm_window->Update(); window_wait(segm_window); #endif }
// Scans the blobs of the chopped word in order and attempts to chop the first
// one that is either divisible, or whose denormalized box overlaps more than
// one of the given boxes without being almost equal to any of them.
// On success returns the seam and leaves the blob's index in *blob_number;
// otherwise returns NULL and sets *blob_number to -1.
SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
                                     bool italic_blob, WERD_RES *word_res,
                                     int *blob_number) {
  TWERD *word = word_res->chopped_word;
  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
    TBLOB *blob = word->blobs[*blob_number];
    const TBOX norm_box = blob->bounding_box();

    // Map the normalized corners back through the word's denorm.
    TPOINT topleft, botright;
    topleft.x = norm_box.left();
    topleft.y = norm_box.top();
    botright.x = norm_box.right();
    botright.y = norm_box.bottom();

    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
    word_res->denorm.DenormTransform(NULL, botright, &original_botright);

    TBOX original_box = TBOX(original_topleft.x, original_botright.y,
                             original_botright.x, original_topleft.y);

    int overlap_count = 0;
    bool matches_a_box = false;
    for (int i = 0; i < boxes.size(); i++) {
      if (original_box.overlap_fraction(boxes[i]) > 0.125)
        overlap_count++;
      if (original_box.almost_equal(boxes[i], 3))
        matches_a_box = true;
    }

    TPOINT location;
    const bool should_chop =
        divisible_blob(blob, italic_blob, &location) ||
        (!matches_a_box && overlap_count > 1);
    if (!should_chop)
      continue;

    SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                   italic_blob, word_res->seam_array);
    if (seam != NULL)
      return seam;
  }

  *blob_number = -1;
  return NULL;
}
// Normalize in-place and record the normalization in the DENORM. void TWERD::SetupBLNormalize(const BLOCK* block, const ROW* row, float x_height, bool numeric_mode, DENORM* denorm) const { int num_segments = 0; DENORM_SEG* segs = NULL; if (numeric_mode) { segs = new DENORM_SEG[NumBlobs()]; for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { TBOX blob_box = blob->bounding_box(); float factor = kBlnXHeight / x_height; factor = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), factor, factor * 1.5f); segs[num_segments].xstart = blob_box.left(); segs[num_segments].ycoord = blob_box.bottom(); segs[num_segments++].scale_factor = factor; } } denorm->SetupBLNormalize(block, row, x_height, bounding_box(), num_segments, segs); delete [] segs; }
// Computes the DENORMS for bl(baseline) and cn(character) normalization // during feature extraction. The input denorm describes the current state // of the blob, which is usually a baseline-normalized word. // The Transforms setup are as follows: // Baseline Normalized (bl) Output: // We center the grapheme by aligning the x-coordinate of its centroid with // x=128 and leaving the already-baseline-normalized y as-is. // // Character Normalized (cn) Output: // We align the grapheme's centroid at the origin and scale it // asymmetrically in x and y so that the 2nd moments are a standard value // (51.2) ie the result is vaguely square. // If classify_nonlinear_norm is true: // A non-linear normalization is setup that attempts to evenly distribute // edges across x and y. // // Some of the fields of fx_info are also setup: // Length: Total length of outline. // Rx: Rounded y second moment. (Reversed by convention.) // Ry: rounded x second moment. // Xmean: Rounded x center of mass of the blob. // Ymean: Rounded y center of mass of the blob. void Classify::SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm, DENORM* bl_denorm, DENORM* cn_denorm, INT_FX_RESULT_STRUCT* fx_info) { // Compute 1st and 2nd moments of the original outline. FCOORD center, second_moments; int length = blob.ComputeMoments(¢er, &second_moments); if (fx_info != nullptr) { fx_info->Length = length; fx_info->Rx = IntCastRounded(second_moments.y()); fx_info->Ry = IntCastRounded(second_moments.x()); fx_info->Xmean = IntCastRounded(center.x()); fx_info->Ymean = IntCastRounded(center.y()); } // Setup the denorm for Baseline normalization. bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f, 1.0f, 1.0f, 128.0f, 128.0f); // Setup the denorm for character normalization. 
if (nonlinear_norm) { GenericVector<GenericVector<int> > x_coords; GenericVector<GenericVector<int> > y_coords; TBOX box; blob.GetPreciseBoundingBox(&box); box.pad(1, 1); blob.GetEdgeCoords(box, &x_coords, &y_coords); cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX, 0.0f, 0.0f, x_coords, y_coords); } else { cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), center.y(), 51.2f / second_moments.x(), 51.2f / second_moments.y(), 128.0f, 128.0f); } }
// Runs the adaptive classifier of ocrengine on a polygonal copy of the given
// blob, after scaling it so that its height becomes kBlnXHeight, and returns
// the first choice of the resulting ratings list, or NULL if the classifier
// produced no choices.
// The returned BLOB_CHOICE is extracted from the local ratings list, so the
// caller takes ownership of it and must delete it. (Returning a pointer that
// was still owned by the local list would dangle once the list is destroyed
// on return.)
BLOB_CHOICE* M_Utils::runBlobOCR(BLOBNBOX* blob, Tesseract* ocrengine) {
  // * Normalize blob height to x-height (current OSD):
  // SetupNormalization(NULL, NULL, &rotation, NULL, NULL, 0,
  //                    box.rotational_x_middle(rotation),
  //                    box.rotational_y_middle(rotation),
  //                    kBlnXHeight / box.rotational_height(rotation),
  //                    kBlnXHeight / box.rotational_height(rotation),
  //                    0, kBlnBaselineOffset);
  BLOB_CHOICE_LIST ratings_lang;
  C_BLOB* cblob = blob->cblob();
  TBLOB* tblob = TBLOB::PolygonalCopy(cblob);
  const TBOX box = tblob->bounding_box();

  // Normalize the blob. Set the origin to the place we want to be the
  // bottom-middle, and scale so that the box height maps to kBlnXHeight.
  float scaling = static_cast<float>(kBlnXHeight) / box.height();
  DENORM denorm;
  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
  denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0,
                            x_orig, y_orig, scaling, scaling,
                            0.0f, static_cast<float>(kBlnBaselineOffset));

  TBLOB* normed_blob = new TBLOB(*tblob);
  normed_blob->Normalize(denorm);
  ocrengine->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL);
  delete normed_blob;
  delete tblob;

  // Get the best choice from ratings_lang. As the choice in the list has
  // already been sorted by the certainty, we simply use the first choice.
  BLOB_CHOICE *lang_choice = NULL;
  if (ratings_lang.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_lang);
    // Unlink the element so it survives the destruction of ratings_lang.
    lang_choice = choice_it.extract();
  }
  return lang_choice;
}
// Normalize in-place and record the normalization in the DENORM. void TWERD::Normalize(ROW* row, float x_height, bool numeric_mode, DENORM* denorm) { TBOX word_box = bounding_box(); DENORM antidote((word_box.left() + word_box.right()) / 2.0, kBlnXHeight / x_height, row); if (row == NULL) { antidote = DENORM(antidote.origin(), antidote.scale(), 0.0, word_box.bottom(), 0, NULL, false, NULL); } int num_segments = 0; DENORM_SEG *segs = new DENORM_SEG[NumBlobs()]; for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { TBOX blob_box = blob->bounding_box(); ICOORD translation(-static_cast<int>(floor(antidote.origin() + 0.5)), -blob_box.bottom()); float factor = antidote.scale(); if (numeric_mode) { factor = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), factor, factor * 1.5f); segs[num_segments].xstart = blob->bounding_box().left(); segs[num_segments].ycoord = blob_box.bottom(); segs[num_segments++].scale_factor = factor; } else { float blob_x_center = (blob_box.left() + blob_box.right()) / 2.0; float y_shift = antidote.yshift_at_orig_x(blob_x_center); translation.set_y(-static_cast<int>(floor(y_shift + 0.5))); } blob->Move(translation); blob->Scale(factor); blob->Move(ICOORD(0, kBlnBaselineOffset)); } if (num_segments > 0) { antidote.set_segments(segs, num_segments); } delete [] segs; if (denorm != NULL) *denorm = antidote; }
// Returns a new x-height maximally compatible with the result in word_res. // See comment above for overall algorithm. float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) { STATS top_stats(0, MAX_UINT8); TBLOB* blob = word_res->rebuild_word->blobs; int blob_id = 0; for (; blob != NULL; blob = blob->next, ++blob_id) { UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { int top = blob->bounding_box().top(); // Clip the top to the limit of normalized feature space. if (top >= INT_FEAT_RANGE) top = INT_FEAT_RANGE - 1; int bottom = blob->bounding_box().bottom(); int min_bottom, max_bottom, min_top, max_top; unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, &min_top, &max_top); // Chars with a wild top range would mess up the result so ignore them. if (max_top - min_top > kMaxCharTopRange) continue; int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, top - (max_top + x_ht_acceptance_tolerance)); int height = top - kBlnBaselineOffset; if (debug_x_ht_level >= 20) { tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ", unicharset.id_to_unichar(class_id), height, min_bottom, max_bottom, min_top, max_top, bottom, top); } // Use only chars that fit in the expected bottom range, and where // the range of tops is sensibly near the xheight. if (min_bottom <= bottom + x_ht_acceptance_tolerance && bottom - x_ht_acceptance_tolerance <= max_bottom && min_top > kBlnBaselineOffset && max_top - kBlnBaselineOffset >= kBlnXHeight && misfit_dist > 0) { // Compute the x-height position using proportionality between the // actual height and expected height. 
int min_xht = DivRounded(height * kBlnXHeight, max_top - kBlnBaselineOffset); int max_xht = DivRounded(height * kBlnXHeight, min_top - kBlnBaselineOffset); if (debug_x_ht_level >= 20) { tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht); } // The range of expected heights gets a vote equal to the distance // of the actual top from the expected top. for (int y = min_xht; y <= max_xht; ++y) top_stats.add(y, misfit_dist); } else if (debug_x_ht_level >= 20) { tprintf(" already OK\n"); } } } if (top_stats.get_total() == 0) return 0.0f; // The new xheight is just the median vote, which is then scaled out // of BLN space back to pixel space to get the x-height in pixel space. float new_xht = top_stats.median(); if (debug_x_ht_level >= 20) { tprintf("Median xht=%f\n", new_xht); tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht, new_xht / word_res->denorm.y_scale()); } // The xheight must change by at least x_ht_min_change to be used. if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) return new_xht / word_res->denorm.y_scale(); else return 0.0f; }
// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
// In addition to voting for x-heights (top_stats), blobs whose bottoms lie
// outside their expected range vote for a bottom shift (shift_stats). If the
// shift votes outweigh the x-height votes on the first pass, the median shift
// is applied and the voting is repeated with the shifted tops and bottoms.
// *baseline_shift is always written: it is the applied bottom shift with the
// opposite sign, divided by the word denorm's y_scale.
// Returns the median x-height vote divided by y_scale when it differs from
// kBlnXHeight by at least x_ht_min_change. Otherwise (or when there are no
// x-height votes) returns word_res->x_height if a bottom shift was applied,
// else 0.0f.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
                                          float* baseline_shift) {
  STATS top_stats(0, MAX_UINT8);
  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
  // Shift applied to every top/bottom; stays 0 during the first pass.
  int bottom_shift = 0;
  int num_blobs = word_res->rebuild_word->NumBlobs();
  do {
    // Each pass starts with fresh votes.
    top_stats.clear();
    shift_stats.clear();
    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
      TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
      if (unicharset.get_isalpha(class_id) ||
          unicharset.get_isdigit(class_id)) {
        int top = blob->bounding_box().top() + bottom_shift;
        // Clip the top to the limit of normalized feature space.
        if (top >= INT_FEAT_RANGE)
          top = INT_FEAT_RANGE - 1;
        int bottom = blob->bounding_box().bottom() + bottom_shift;
        int min_bottom, max_bottom, min_top, max_top;
        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                  &min_top, &max_top);
        // Chars with a wild top range would mess up the result so ignore them.
        if (max_top - min_top > kMaxCharTopRange)
          continue;
        // Positive only when the top is outside the tolerated range.
        int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                            top - (max_top + x_ht_acceptance_tolerance));
        int height = top - kBlnBaselineOffset;
        if (debug_x_ht_level >= 2) {
          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
                  unicharset.id_to_unichar(class_id),
                  height, min_bottom, max_bottom, min_top, max_top,
                  bottom, top);
        }
        // Use only chars that fit in the expected bottom range, and where
        // the range of tops is sensibly near the xheight.
        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
            bottom - x_ht_acceptance_tolerance <= max_bottom &&
            min_top > kBlnBaselineOffset &&
            max_top - kBlnBaselineOffset >= kBlnXHeight &&
            misfit_dist > 0) {
          // Compute the x-height position using proportionality between the
          // actual height and expected height.
          int min_xht = DivRounded(height * kBlnXHeight,
                                   max_top - kBlnBaselineOffset);
          int max_xht = DivRounded(height * kBlnXHeight,
                                   min_top - kBlnBaselineOffset);
          if (debug_x_ht_level >= 2) {
            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
          }
          // The range of expected heights gets a vote equal to the distance
          // of the actual top from the expected top.
          for (int y = min_xht; y <= max_xht; ++y)
            top_stats.add(y, misfit_dist);
        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
                    bottom - x_ht_acceptance_tolerance > max_bottom) &&
                   bottom_shift == 0) {
          // Bottom is out of range: vote for a shift (first pass only).
          // Get the range of required bottom shift.
          int min_shift = min_bottom - bottom;
          int max_shift = max_bottom - bottom;
          if (debug_x_ht_level >= 2) {
            tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
          }
          // The range of expected shifts gets a vote equal to the min distance
          // of the actual bottom from the expected bottom, spread over the
          // range of its acceptance.
          int misfit_weight = abs(min_shift);
          if (max_shift > min_shift)
            misfit_weight /= max_shift - min_shift;
          for (int y = min_shift; y <= max_shift; ++y)
            shift_stats.add(y, misfit_weight);
        } else {
          if (bottom_shift == 0) {
            // Things with bottoms that are already ok need to say so, on the
            // 1st iteration only.
            shift_stats.add(0, kBlnBaselineOffset);
          }
          if (debug_x_ht_level >= 2) {
            tprintf(" already OK\n");
          }
        }
      }
    }
    // Shift votes outweigh x-height votes: adopt the median shift.
    if (shift_stats.get_total() > top_stats.get_total()) {
      bottom_shift = IntCastRounded(shift_stats.median());
      if (debug_x_ht_level >= 2) {
        tprintf("Applying bottom shift=%d\n", bottom_shift);
      }
    }
    // Repeat only while a non-zero shift is in force and shift votes still
    // dominate. Shift votes are only added while bottom_shift == 0.
  } while (bottom_shift != 0 &&
           top_stats.get_total() < shift_stats.get_total());
  // Baseline shift is opposite sign to the bottom shift.
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
  if (debug_x_ht_level >= 2) {
    tprintf("baseline shift=%g\n", *baseline_shift);
  }
  // No x-height votes: keep the existing x-height only if a shift was applied.
  if (top_stats.get_total() == 0)
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 2) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
            new_xht, new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
    return new_xht / word_res->denorm.y_scale();
  else
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
}
/**
 * @name uniformly_spaced()
 * Return true if one of the following are true:
 *    - All inter-char gaps are the same width
 *    - The largest gap is no larger than twice the mean/median of the others
 *    - The largest gap is < normalised_max_nonspace
 * Gaps next to a punctuation character (one of "`',.:;) are ignored.
 * **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
 */
BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
  inT16 prev_right = -MAX_INT16;
  inT16 largest_gap = -MAX_INT16;
  inT16 largest_gap_count = 0;
  STATS gap_stats(0, MAXSPACING);
  const ROW *row = word->denorm.row();
  inT16 char_index = 0;
  inT16 offset = 0;
  STRING punct_chars = "\"`',.:;";

  TBLOB* current = word->rebuild_word->blobs;
  while (current != NULL) {
    TBOX box = current->bounding_box();
    // Skip the first blob (no previous edge) and gaps beside punctuation.
    if (prev_right > -MAX_INT16 &&
        !punct_chars.contains(
            word->best_choice->unichar_string()
            [offset - word->best_choice->unichar_lengths()[char_index - 1]]) &&
        !punct_chars.contains(
            word->best_choice->unichar_string()[offset])) {
      inT16 gap = box.left() - prev_right;
      if (gap > largest_gap) {
        // New largest gap: demote the previous largest into the stats.
        if (largest_gap_count > 0)
          gap_stats.add(largest_gap, largest_gap_count);
        largest_gap = gap;
        largest_gap_count = 1;
      } else if (gap == largest_gap) {
        largest_gap_count++;
      } else {
        gap_stats.add(gap, 1);
      }
    }
    prev_right = box.right();
    offset += word->best_choice->unichar_lengths()[char_index++];
    current = current->next;
  }

  float max_non_space = (row->space() + 3 * row->kern()) / 4;
  float normalised_max_nonspace =
      max_non_space * kBlnXHeight / row->x_height();

  BOOL8 result = (
      gap_stats.get_total() == 0 ||
      largest_gap <= normalised_max_nonspace ||
      (gap_stats.get_total() > 2 && largest_gap <= 2 * gap_stats.median()) ||
      (gap_stats.get_total() <= 2 && largest_gap <= 2 * gap_stats.mean()));

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 1) {
    if (!result) {
      tprintf(
          "REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
          "total=%d mean=%f median=%f\n",
          word->best_choice->unichar_string().string(),
          normalised_max_nonspace,
          largest_gap, largest_gap_count, gap_stats.get_total(),
          gap_stats.mean(), gap_stats.median());
    } else {
      tprintf(
          "ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
          "total=%d mean=%f median=%f\n",
          word->best_choice->unichar_string().string(),
          normalised_max_nonspace,
          largest_gap, largest_gap_count, gap_stats.get_total(),
          gap_stats.mean(), gap_stats.median());
    }
  }
#endif

  return result;
}
// Returns true if the blob is small enough to be a large speckle. bool Classify::LargeSpeckle(const TBLOB &blob) { double speckle_size = kBlnXHeight * speckle_large_max_size; TBOX bbox = blob.bounding_box(); return bbox.width() < speckle_size && bbox.height() < speckle_size; }
// Returns true if *this SPLIT appears OK in the sense that it does not cross // any outlines and does not chop off any ridiculously small pieces. bool SPLIT::IsHealthy(const TBLOB& blob, int min_points, int min_area) const { return !IsLittleChunk(min_points, min_area) && !blob.SegmentCrossesOutline(point1->pos, point2->pos); }
// Recomputes the bounding boxes of the blobs. void TWERD::ComputeBoundingBoxes() { for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { blob->ComputeBoundingBoxes(); } }
// Sets up the script_pos_ member using the blobs_list to get the bln // bounding boxes, *this to get the unichars, and this->unicharset // to get the target positions. If small_caps is true, sub/super are not // considered, but dropcaps are. // NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.) void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) { // Since WERD_CHOICE isn't supposed to depend on a Tesseract, // we don't have easy access to the flags Tesseract stores. Therefore, debug // for this module is hard compiled in. int debug = 0; // Initialize to normal. for (int i = 0; i < length_; ++i) script_pos_[i] = tesseract::SP_NORMAL; if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) { return; } int position_counts[4]; for (int i = 0; i < 4; i++) { position_counts[i] = 0; } int chunk_index = 0; for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) { TBLOB* tblob = word->blobs[chunk_index]; int uni_id = unichar_id(blob_index); TBOX blob_box = tblob->bounding_box(); if (state_ != NULL) { for (int i = 1; i < state_[blob_index]; ++i) { ++chunk_index; tblob = word->blobs[chunk_index]; blob_box += tblob->bounding_box(); } } script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box, uni_id); if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) { script_pos_[blob_index] = tesseract::SP_NORMAL; } position_counts[script_pos_[blob_index]]++; } // If almost everything looks like a superscript or subscript, // we most likely just got the baseline wrong. 
if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ || position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) { if (debug >= 2) { tprintf("Most characters of %s are subscript or superscript.\n" "That seems wrong, so I'll assume we got the baseline wrong\n", unichar_string().string()); } for (int i = 0; i < length_; i++) { ScriptPos sp = script_pos_[i]; if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) { position_counts[sp]--; position_counts[tesseract::SP_NORMAL]++; script_pos_[i] = tesseract::SP_NORMAL; } } } if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) || debug >= 2) { tprintf("SetScriptPosition on %s\n", unichar_string().string()); int chunk_index = 0; for (int blob_index = 0; blob_index < length_; ++blob_index) { if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) { TBLOB* tblob = word->blobs[chunk_index]; ScriptPositionOf(true, *unicharset_, tblob->bounding_box(), unichar_id(blob_index)); } chunk_index += state_ != NULL ? state_[blob_index] : 1; } } }
// Normalize in-place using the DENORM. void TWERD::Normalize(const DENORM& denorm) { for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) { blob->Normalize(denorm); } }
// Rewrites the unichar ids of word_res->best_choice in place, swapping
// between the unichars "0" and "O" wherever the neighbouring characters
// match one of the patterns checked below.
// Does nothing if tessedit_flip_0O is off, if any upper-case or digit
// character's blob fails the vertical-extent check (guarding against
// sub/superscripts), or if "0" or "O" is missing or disabled in the
// unicharset.
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  TBOX out_box;

  if (!tessedit_flip_0O)
    return;

  // Bail out if any upper-case/digit blob has its top below
  // kBlnBaselineOffset + kBlnXHeight or its bottom above
  // kBlnBaselineOffset + kBlnXHeight / 4.
  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      out_box = blob->bounding_box();
      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
        return;  // Beware words with sub/superscripts
    }
  }
  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }
  // Start at 1: every pattern below reads the character at i-1.
  // The pattern checks run one after another on the same character and
  // several of them advance i, so their order matters.
  for (i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 ||
        best_choice->unichar_id(i) == unichar_O) {
      /* A0A: a 0/O between two non_O_upper characters becomes O. */
      if ((i+1) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* A00A: only the first of the pair is set to O here; i then moves */
      /* on to the second one, which the following checks examine. */
      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (i+2) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_O, i);
        i++;
      }
      /* AA0<non digit or end of word>: the following character must also */
      /* not be "l" or "I". */
      if ((i > 1) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (((i+1) < best_choice->length() &&
            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* 9O9: a 0/O between two non_0_digit characters becomes 0. */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9OOO: all three become 0 and i skips to the last of them. */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (best_choice->unichar_id(i+2) == unichar_0 ||
           best_choice->unichar_id(i+2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        best_choice->set_unichar_id(unichar_0, i+2);
        i += 2;
      }
      /* 9OO<non upper>: both become 0 and i moves on to the second. */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        i++;
      }
      /* 9O<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9[.,]OOO..: after a digit (or O) followed by "." or ",", the */
      /* whole run of 0/O characters starting at i becomes 0. */
      if ((i > 1) &&
          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
           word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
           best_choice->unichar_id(i-2) == unichar_O)) {
        if (best_choice->unichar_id(i-2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i-2);
        }
        while (i < best_choice->length() &&
               (best_choice->unichar_id(i) == unichar_O ||
                best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          i++;
        }
        // Step back onto the last converted character; the for loop's ++i
        // then moves past it.
        i--;
      }
    }
  }
}
/** * Return whether this is believable superscript or subscript text. * * We insist that: * + there are no punctuation marks. * + there are no italics. * + no normal-sized character is smaller than superscript_scaledown_ratio * of what it ought to be, and * + each character is at least as certain as certainty_threshold. * * @param[in] debug If true, spew debug output * @param[in] word The word whose best_choice we're evaluating * @param[in] certainty_threshold If any of the characters have less * certainty than this, reject. * @param[out] left_ok How many left-side characters were ok? * @param[out] right_ok How many right-side characters were ok? * @return Whether the complete best choice is believable as a superscript. */ bool Tesseract::BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const { int initial_ok_run_count = 0; int ok_run_count = 0; float worst_certainty = 0.0f; const WERD_CHOICE &wc = *word.best_choice; const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table(); for (int i = 0; i < wc.length(); i++) { TBLOB *blob = word.rebuild_word->blobs[i]; UNICHAR_ID unichar_id = wc.unichar_id(i); float char_certainty = wc.certainty(i); bool bad_certainty = char_certainty < certainty_threshold; bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id); bool is_italic = word.fontinfo && word.fontinfo->is_italic(); BLOB_CHOICE *choice = word.GetBlobChoice(i); if (choice && fontinfo_table.size() > 0) { // Get better information from the specific choice, if available. int font_id1 = choice->fontinfo_id(); bool font1_is_italic = font_id1 >= 0 ? 
fontinfo_table.get(font_id1).is_italic() : false; int font_id2 = choice->fontinfo_id2(); is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic()); } float height_fraction = 1.0f; float char_height = blob->bounding_box().height(); float normal_height = char_height; if (wc.unicharset()->top_bottom_useful()) { int min_bot, max_bot, min_top, max_top; wc.unicharset()->get_top_bottom(unichar_id, &min_bot, &max_bot, &min_top, &max_top); float hi_height = max_top - max_bot; float lo_height = min_top - min_bot; normal_height = (hi_height + lo_height) / 2; if (normal_height >= kBlnXHeight) { // Only ding characters that we have decent information for because // they're supposed to be normal sized, not tiny specks or dashes. height_fraction = char_height / normal_height; } } bool bad_height = height_fraction < superscript_scaledown_ratio; if (debug) { if (is_italic) { tprintf(" Rejecting: superscript is italic.\n"); } if (is_punc) { tprintf(" Rejecting: punctuation present.\n"); } const char *char_str = wc.unicharset()->id_to_unichar(unichar_id); if (bad_certainty) { tprintf(" Rejecting: don't believe character %s with certainty %.2f " "which is less than threshold %.2f\n", char_str, char_certainty, certainty_threshold); } if (bad_height) { tprintf(" Rejecting: character %s seems too small @ %.2f versus " "expected %.2f\n", char_str, char_height, normal_height); } } if (bad_certainty || bad_height || is_punc || is_italic) { if (ok_run_count == i) { initial_ok_run_count = ok_run_count; } ok_run_count = 0; } else { ok_run_count++; } if (char_certainty < worst_certainty) { worst_certainty = char_certainty; } } bool all_ok = ok_run_count == wc.length(); if (all_ok && debug) { tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty); } if (!all_ok) { if (left_ok) *left_ok = initial_ok_run_count; if (right_ok) *right_ok = ok_run_count; } return all_ok; }