// Helper to compute edge offsets for all the blobs on the list. // See coutln.h for an explanation of edge offsets. void BLOBNBOX::ComputeEdgeOffsets(Pix* thresholds, Pix* grey, BLOBNBOX_LIST* blobs) { int grey_height = 0; int thr_height = 0; int scale_factor = 1; if (thresholds != NULL && grey != NULL) { grey_height = pixGetHeight(grey); thr_height = pixGetHeight(thresholds); scale_factor = IntCastRounded(static_cast<double>(grey_height) / thr_height); } BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (blob->cblob() != NULL) { // Get the threshold that applies to this blob. l_uint32 threshold = 128; if (thresholds != NULL && grey != NULL) { const TBOX& box = blob->cblob()->bounding_box(); // Transform the coordinates if required. TPOINT pt((box.left() + box.right()) / 2, (box.top() + box.bottom()) / 2); pixGetPixel(thresholds, pt.x / scale_factor, thr_height - 1 - pt.y / scale_factor, &threshold); } blob->cblob()->ComputeEdgeOffsets(threshold, grey); } } }
// Reorganize the blob lists with a different definition of small, medium // and large, compared to the original definition. // Height is still the primary filter key, but medium width blobs of small // height become small, and very wide blobs of small height stay noise, along // with small dot-shaped blobs. void StrokeWidth::ReFilterBlobs(TO_BLOCK* block) { int min_height = static_cast<int>(textord_strokewidth_minsize * block->line_size + 0.5); int max_height = static_cast<int>(textord_strokewidth_maxsize * block->line_size + 0.5); BLOBNBOX_LIST noise_list; BLOBNBOX_LIST small_list; BLOBNBOX_LIST medium_list; BLOBNBOX_LIST large_list; SizeFilterBlobs(min_height, max_height, &block->blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &block->large_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &block->small_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &block->noise_blobs, &noise_list, &small_list, &medium_list, &large_list); BLOBNBOX_IT blob_it(&block->blobs); blob_it.add_list_after(&medium_list); blob_it.set_to_list(&block->large_blobs); blob_it.add_list_after(&large_list); blob_it.set_to_list(&block->small_blobs); blob_it.add_list_after(&small_list); blob_it.set_to_list(&block->noise_blobs); blob_it.add_list_after(&noise_list); }
// Converts the Boxa array to a list of C_BLOB, getting rid of severely // overlapping outlines and those that are children of a bigger one. // The output is a list of C_BLOBs that are owned by the list. // The C_OUTLINEs in the C_BLOBs contain no outline data - just empty // bounding boxes. The Boxa is consumed and destroyed. void LineFinder::ConvertBoxaToBlobs(int image_width, int image_height, Boxa** boxes, C_BLOB_LIST* blobs) { #ifdef HAVE_LIBLEPT C_OUTLINE_LIST outlines; C_OUTLINE_IT ol_it = &outlines; // Iterate the boxes to convert to outlines. int nboxes = boxaGetCount(*boxes); for (int i = 0; i < nboxes; ++i) { l_int32 x, y, width, height; boxaGetBoxGeometry(*boxes, i, &x, &y, &width, &height); // Make a C_OUTLINE from the leptonica box. This is a bit of a hack, // as there is no outline, just a bounding box, but with some very // small changes to coutln.cpp, it works nicely. ICOORD top_left(x, image_height - y); ICOORD bot_right(x + width, image_height - (y + height)); CRACKEDGE startpt; startpt.pos = top_left; C_OUTLINE* outline = new C_OUTLINE(&startpt, top_left, bot_right, 0); ol_it.add_after_then_move(outline); } // Use outlines_to_blobs to convert the outlines to blobs and find // overlapping and contained objects. The output list of blobs in the block // has all the bad ones filtered out and deleted. BLOCK block; ICOORD page_tl(0, 0); ICOORD page_br(image_width, image_height); outlines_to_blobs(&block, page_tl, page_br, &outlines); // Transfer the created blobs to the output list. C_BLOB_IT blob_it(blobs); blob_it.add_list_after(block.blob_list()); // The boxes aren't needed any more. boxaDestroy(boxes); #endif }
// Remove outlines that are a tiny fraction in either width or height // of the word height. void Textord::clean_small_noise_from_words(ROW *row) { WERD_IT word_it(row->word_list()); for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { WERD* word = word_it.data(); int min_size = static_cast<int>( textord_noise_hfract * word->bounding_box().height() + 0.5); C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT out_it(blob->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { C_OUTLINE* outline = out_it.data(); outline->RemoveSmallRecursive(min_size, &out_it); } if (blob->out_list()->empty()) { delete blob_it.extract(); } } if (word->cblob_list()->empty()) { if (!word_it.at_last()) { // The next word is no longer a fuzzy non space if it was before, // since the word before is about to be deleted. WERD* next_word = word_it.data_relative(1); if (next_word->flag(W_FUZZY_NON)) { next_word->set_flag(W_FUZZY_NON, false); } } delete word_it.extract(); } } }
// Walks the words of page_res looking for the first word whose bounding box
// overlaps selection_box and which has at least one blob overlapping it.
// Deep copies of the overlapping blobs are made into a new WERD, which is
// inserted via InsertSimpleCloneWord. Returns a newly allocated PAGE_RES_IT
// positioned at the inserted word, or NULL if no blob overlapped.
// NOTE(review): the returned iterator is allocated with new here; presumably
// the caller is responsible for deleting it - confirm against callers.
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT page_it(page_res);
  C_BLOB_LIST gathered;                  // Copies of the selected blobs.
  C_BLOB_IT gathered_it(&gathered);      // Appends to gathered.
  for (WERD_RES* word_res = page_it.word(); word_res != NULL;
       word_res = page_it.forward()) {
    WERD* word = word_res->word;
    if (!word->bounding_box().overlap(selection_box)) continue;
    // Copy every blob of this word that touches the selection.
    C_BLOB_IT src_it(word->cblob_list());
    for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
      C_BLOB* candidate = src_it.data();
      if (candidate->bounding_box().overlap(selection_box)) {
        gathered_it.add_after_then_move(C_BLOB::deep_copy(candidate));
      }
    }
    if (gathered.empty()) continue;
    WERD* pseudo_word = new WERD(&gathered, 1, NULL);
    word_res = page_it.InsertSimpleCloneWord(*word_res, pseudo_word);
    // Build a fresh iterator and advance it to the newly inserted word.
    PAGE_RES_IT* result = new PAGE_RES_IT(page_res);
    while (result->word() != word_res && result->word() != NULL)
      result->forward();
    ASSERT_HOST(result->word() == word_res);
    return result;
  }
  return NULL;
}
// Computes the noise_density_ by summing the number of elements in a // neighbourhood of each grid cell. void StrokeWidth::ComputeNoiseDensity(TO_BLOCK* block, TabFind* line_grid) { // Run a preliminary strokewidth neighbour detection on the medium blobs. line_grid->InsertBlobList(true, true, false, &block->blobs, false, this); BLOBNBOX_IT blob_it(&block->blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { SetNeighbours(false, blob_it.data()); } // Remove blobs with a good strokewidth neighbour from the grid. for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (blob->GoodTextBlob() > 0) RemoveBBox(blob); blob->ClearNeighbours(); } // Insert the smaller blobs into the grid. line_grid->InsertBlobList(true, true, false, &block->small_blobs, false, this); line_grid->InsertBlobList(true, true, false, &block->noise_blobs, false, this); if (noise_density_ != NULL) delete noise_density_; IntGrid* cell_counts = CountCellElements(); noise_density_ = cell_counts->NeighbourhoodSum(); delete cell_counts; // Clear the grid as we don't want the small stuff hanging around in it. Clear(); }
// Inserts a list of blobs into the projection. // Rotation is a multiple of 90 degrees to get from blob coords to // nontext_map coords, nontext_map_box is the bounds of the nontext_map. // Blobs are spread horizontally or vertically according to their internal // flags, but the spreading is truncated by set pixels in the nontext_map // and also by the horizontal rule line limits on the blobs. void TextlineProjection::ProjectBlobs(BLOBNBOX_LIST* blobs, const FCOORD& rotation, const TBOX& nontext_map_box, Pix* nontext_map) { BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); TBOX bbox = blob->bounding_box(); ICOORD middle((bbox.left() + bbox.right()) / 2, (bbox.bottom() + bbox.top()) / 2); bool spreading_horizontally = PadBlobBox(blob, &bbox); // Rotate to match the nontext_map. bbox.rotate(rotation); middle.rotate(rotation); if (rotation.x() == 0.0f) spreading_horizontally = !spreading_horizontally; // Clip to the image before applying the increments. bbox &= nontext_map_box; // This is in-place box intersection. // Check for image pixels before spreading. TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally, nontext_map, &bbox); if (bbox.area() > 0) { IncrementRectangle8Bit(bbox); } } }
// Inserts all the blobs from the given list, with x and y spreading, // without removing from the source list, so ownership remains with the // source list. void BlobGrid::InsertBlobList(BLOBNBOX_LIST * blobs) { BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX *blob = blob_it.data(); if (!blob->joined_to_prev()) InsertBBox(true, true, blob); } }
// Returns true if the blob at position index in the blob list of the word's
// outword does not have exactly one outline.
// The iterator is simply stepped forward index times, so index is expected
// to be a valid position in the list.
BOOL8 suspect_em(WERD_RES *word, inT16 index) {
  PBLOB_IT blob_it(word->outword->blob_list());
  // Advance from the first blob to the one at the requested position.
  for (inT16 step = 0; step < index; step++) {
    blob_it.forward();
  }
  return (blob_it.data()->out_list()->length() != 1);
}
// Helper to delete all the deletable blobs on the list. void BLOBNBOX::DeleteNoiseBlobs(BLOBNBOX_LIST* blobs) { BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (blob->DeletableNoise()) { delete blob->cblob(); delete blob_it.extract(); } } }
// Finds horizontal line objects in the given pix. // Uses the given resolution to determine size thresholds instead of any // that may be present in the pix. // The output vectors are owned by the list and Frozen (cannot refit) by // having no boxes, as there is no need to refit or merge separator lines. void LineFinder::FindHorizontalLines(int resolution, Pix* pix, TabVector_LIST* vectors) { #ifdef HAVE_LIBLEPT Pix* line_pix; Boxa* boxes = GetHLineBoxes(resolution, pix, &line_pix); C_BLOB_LIST line_cblobs; int width = pixGetWidth(pix); int height = pixGetHeight(pix); ConvertBoxaToBlobs(height, width, &boxes, &line_cblobs); // Make the BLOBNBOXes from the C_BLOBs. BLOBNBOX_LIST line_bblobs; C_BLOB_IT blob_it(&line_cblobs); BLOBNBOX_IT bbox_it(&line_bblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* cblob = blob_it.data(); BLOBNBOX* bblob = new BLOBNBOX(cblob); bbox_it.add_to_end(bblob); } ICOORD bleft(0, 0); ICOORD tright(height, width); int vertical_x, vertical_y; FindLineVectors(bleft, tright, &line_bblobs, &vertical_x, &vertical_y, vectors); if (!vectors->empty()) { // Some lines were found, so erase the unused blobs from the line image // and then subtract the line image from the source. bbox_it.move_to_first(); for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { BLOBNBOX* blob = bbox_it.data(); if (blob->left_tab_type() == TT_UNCONFIRMED) { const TBOX& box = blob->bounding_box(); // Coords are in tess format so filp x and y and then covert // to leptonica by height -y. 
Box* pixbox = boxCreate(box.bottom(), height - box.right(), box.height(), box.width()); pixClearInRect(line_pix, pixbox); boxDestroy(&pixbox); } } pixDilateBrick(line_pix, line_pix, 3, 1); pixSubtract(pix, pix, line_pix); if (textord_tabfind_show_vlines) pixWrite("hlinesclean.png", line_pix, IFF_PNG); ICOORD vertical; vertical.set_with_shrink(vertical_x, vertical_y); TabVector::MergeSimilarTabVectors(vertical, vectors, NULL); // Iterate the vectors to flip them. TabVector_IT h_it(vectors); for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) { h_it.data()->XYFlip(); } } pixDestroy(&line_pix); #endif }
// Tests each blob in the list to see if it is certain non-text using 2 // conditions: // 1. blob overlaps a cell with high value in noise_density_ (previously set // by ComputeNoiseDensity). // OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This // condition is disabled with max_blob_overlaps == -1. // If it does, the blob is declared non-text, and is used to mark up the // nontext_mask. Such blobs are fully deleted, and non-noise blobs have their // neighbours reset, as they may now point to deleted data. // WARNING: The blobs list blobs may be in the *this grid, but they are // not removed. If any deleted blobs might be in *this, then this must be // Clear()ed immediately after MarkAndDeleteNonTextBlobs is called. // If the win is not NULL, deleted blobs are drawn on it in red, and kept // blobs are drawn on it in ok_color. void CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST* blobs, int max_blob_overlaps, ScrollView* win, ScrollView::Color ok_color, Pix* nontext_mask) { int imageheight = tright().y() - bleft().x(); BLOBNBOX_IT blob_it(blobs); BLOBNBOX_LIST dead_blobs; BLOBNBOX_IT dead_it(&dead_blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); TBOX box = blob->bounding_box(); if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) && (max_blob_overlaps < 0 || !BlobOverlapsTooMuch(blob, max_blob_overlaps))) { blob->ClearNeighbours(); #ifndef GRAPHICS_DISABLED if (win != NULL) blob->plot(win, ok_color, ok_color); #endif // GRAPHICS_DISABLED } else { if (noise_density_->AnyZeroInRect(box)) { // There is a danger that the bounding box may overlap real text, so // we need to render the outline. 
Pix* blob_pix = blob->cblob()->render_outline(); pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(), PIX_SRC | PIX_DST, blob_pix, 0, 0); pixDestroy(&blob_pix); } else { if (box.area() < gridsize() * gridsize()) { // It is a really bad idea to make lots of small components in the // photo mask, so try to join it to a bigger area by expanding the // box in a way that does not touch any zero noise density cell. box = AttemptBoxExpansion(box, *noise_density_, gridsize()); } // All overlapped cells are non-zero, so just mark the rectangle. pixRasterop(nontext_mask, box.left(), imageheight - box.top(), box.width(), box.height(), PIX_SET, NULL, 0, 0); } #ifndef GRAPHICS_DISABLED if (win != NULL) blob->plot(win, ScrollView::RED, ScrollView::RED); #endif // GRAPHICS_DISABLED // It is safe to delete the cblob now, as it isn't used by the grid // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the // dead_blobs list. // TODO(rays) delete the delete when the BLOBNBOX destructor deletes // the cblob. delete blob->cblob(); dead_it.add_to_end(blob_it.extract()); } } }
// Adds the selected outlines to the indcated real blobs, and puts the rest // back in rej_cblobs where they came from. Where the target_blobs entry is // nullptr, a run of wanted outlines is put into a single new blob. // Ownership of the outlines is transferred back to the word. (Hence // GenericVector and not PointerVector.) // Returns true if any new blob was added to the start of the word, which // suggests that it might need joining to the word before it, and likewise // sets make_next_word_fuzzy true if any new blob was added to the end. bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted, const GenericVector<C_BLOB*>& target_blobs, const GenericVector<C_OUTLINE*>& outlines, bool* make_next_word_fuzzy) { bool outline_added_to_start = false; if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = false; C_BLOB_IT rej_it(&rej_cblobs); for (int i = 0; i < outlines.size(); ++i) { C_OUTLINE* outline = outlines[i]; if (outline == nullptr) continue; // Already used it. if (wanted[i]) { C_BLOB* target_blob = target_blobs[i]; TBOX noise_box = outline->bounding_box(); if (target_blob == nullptr) { target_blob = new C_BLOB(outline); // Need to find the insertion point. C_BLOB_IT blob_it(&cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (blob_box.left() > noise_box.left()) { if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { // We might want to join this word to its predecessor. outline_added_to_start = true; } blob_it.add_before_stay_put(target_blob); break; } } if (blob_it.cycled_list()) { blob_it.add_to_end(target_blob); if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = true; } // Add all consecutive wanted, but null-blob outlines to same blob. 
C_OUTLINE_IT ol_it(target_blob->out_list()); while (i + 1 < outlines.size() && wanted[i + 1] && target_blobs[i + 1] == nullptr) { ++i; ol_it.add_to_end(outlines[i]); } } else { // Insert outline into this blob. C_OUTLINE_IT ol_it(target_blob->out_list()); ol_it.add_to_end(outline); } } else { // Put back on noise list. rej_it.add_to_end(new C_BLOB(outline)); } } return outline_added_to_start; }
// Moves every blob of other onto the end of this word: the blobs in
// other->cblobs are appended to cblobs, and those in other->rej_cblobs are
// appended to rej_cblobs. Both lists of other are left empty.
void WERD::join_on(WERD* other) {
  // Transfer the regular blobs.
  C_BLOB_IT dest_it(&cblobs);
  C_BLOB_IT from_it(&other->cblobs);
  for (; !from_it.empty(); from_it.forward()) {
    dest_it.add_to_end(from_it.extract());
  }
  // Transfer the rejected blobs.
  C_BLOB_IT dest_rej_it(&rej_cblobs);
  C_BLOB_IT from_rej_it(&other->rej_cblobs);
  for (; !from_rej_it.empty(); from_rej_it.forward()) {
    dest_rej_it.add_to_end(from_rej_it.extract());
  }
}
// Finds vertical line objects in the given pix. // Uses the given resolution to determine size thresholds instead of any // that may be present in the pix. // The output vertical_x and vertical_y contain a sum of the output vectors, // thereby giving the mean vertical direction. // The output vectors are owned by the list and Frozen (cannot refit) by // having no boxes, as there is no need to refit or merge separator lines. void LineFinder::FindVerticalLines(int resolution, Pix* pix, int* vertical_x, int* vertical_y, TabVector_LIST* vectors) { #ifdef HAVE_LIBLEPT Pix* line_pix; Boxa* boxes = GetVLineBoxes(resolution, pix, &line_pix); C_BLOB_LIST line_cblobs; int width = pixGetWidth(pix); int height = pixGetHeight(pix); ConvertBoxaToBlobs(width, height, &boxes, &line_cblobs); // Make the BLOBNBOXes from the C_BLOBs. BLOBNBOX_LIST line_bblobs; C_BLOB_IT blob_it(&line_cblobs); BLOBNBOX_IT bbox_it(&line_bblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* cblob = blob_it.data(); BLOBNBOX* bblob = new BLOBNBOX(cblob); bbox_it.add_to_end(bblob); } ICOORD bleft(0, 0); ICOORD tright(width, height); FindLineVectors(bleft, tright, &line_bblobs, vertical_x, vertical_y, vectors); if (!vectors->empty()) { // Some lines were found, so erase the unused blobs from the line image // and then subtract the line image from the source. 
bbox_it.move_to_first(); for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) { BLOBNBOX* blob = bbox_it.data(); if (blob->left_tab_type() == TT_UNCONFIRMED) { const TBOX& box = blob->bounding_box(); Box* pixbox = boxCreate(box.left(), height - box.top(), box.width(), box.height()); pixClearInRect(line_pix, pixbox); boxDestroy(&pixbox); } } pixDilateBrick(line_pix, line_pix, 1, 3); pixSubtract(pix, pix, line_pix); if (textord_tabfind_show_vlines) pixWrite("vlinesclean.png", line_pix, IFF_PNG); ICOORD vertical; vertical.set_with_shrink(*vertical_x, *vertical_y); TabVector::MergeSimilarTabVectors(vertical, vectors, NULL); } pixDestroy(&line_pix); #endif }
// Sets up displacement_modes_ with the top few modes of the perpendicular // distance of each blob from the given direction vector, after rounding. void BaselineRow::SetupBlobDisplacements(const FCOORD& direction) { // Set of perpendicular displacements of the blob bottoms from the required // baseline direction. GenericVector<double> perp_blob_dists; displacement_modes_.truncate(0); // Gather the skew-corrected position of every blob. double min_dist = MAX_FLOAT32; double max_dist = -MAX_FLOAT32; BLOBNBOX_IT blob_it(blobs_); bool debug = false; for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); const TBOX& box = blob->bounding_box(); #ifdef kDebugYCoord if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) debug = true; #endif FCOORD blob_pos((box.left() + box.right()) / 2.0f, blob->baseline_position()); double offset = direction * blob_pos; perp_blob_dists.push_back(offset); if (debug) { tprintf("Displacement %g for blob at:", offset); box.print(); } UpdateRange(offset, &min_dist, &max_dist); } // Set up a histogram using disp_quant_factor_ as the bucket size. STATS dist_stats(IntCastRounded(min_dist / disp_quant_factor_), IntCastRounded(max_dist / disp_quant_factor_) + 1); for (int i = 0; i < perp_blob_dists.size(); ++i) { dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1); } GenericVector<KDPairInc<float, int> > scaled_modes; dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes); if (debug) { for (int i = 0; i < scaled_modes.size(); ++i) { tprintf("Top mode = %g * %d\n", scaled_modes[i].key * disp_quant_factor_, scaled_modes[i].data); } } for (int i = 0; i < scaled_modes.size(); ++i) displacement_modes_.push_back(disp_quant_factor_ * scaled_modes[i].key); }
// Removes noise from the word by moving small outlines to the rej_cblobs // list, based on the size_threshold. void WERD::CleanNoise(float size_threshold) { C_BLOB_IT blob_it(&cblobs); C_BLOB_IT rej_it(&rej_cblobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); C_OUTLINE_IT ol_it(blob->out_list()); for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { C_OUTLINE* outline = ol_it.data(); TBOX ol_box = outline->bounding_box(); int ol_size = ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); if (ol_size < size_threshold) { // This outline is too small. Move it to a separate blob in the // reject blobs list. C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); rej_it.add_after_then_move(rej_blob); } } if (blob->out_list()->empty()) delete blob_it.extract(); } }
// Returns true if the blob at position i in the blob list of the word's
// outword has a bounding box whose aspect ratio (longer side over shorter
// side) exceeds tessed_fullstop_aspect_ratio.
// The iterator is simply stepped forward i times, so i is expected to be a
// valid position in the list.
BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) {
  PBLOB_IT blob_it(word->outword->blob_list());
  // Advance from the first blob to the one at the requested position.
  for (inT16 step = 0; step < i; step++) {
    blob_it.forward();
  }
  const TBOX box = blob_it.data()->bounding_box();
  const inT16 width = box.width();
  const inT16 height = box.height();
  // Always divide the larger dimension by the smaller one.
  float aspect_ratio;
  if (width > height) {
    aspect_ratio = static_cast<float>(width) / height;
  } else {
    aspect_ratio = static_cast<float>(height) / width;
  }
  return (aspect_ratio > tessed_fullstop_aspect_ratio);
}
// Fixes the block so it obeys all the rules: // Must have at least one ROW. // Must have at least one WERD. // WERDs contain a fake blob. void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. ROW_IT row_it(block->row_list()); if (row_it.empty()) { const TBOX& box = block->pdblk.bounding_box(); float height = box.height(); int32_t xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())}; ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1); row_it.add_after_then_move(row); } // Each row must contain at least one word. for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { ROW* row = row_it.data(); WERD_IT w_it(row->word_list()); if (w_it.empty()) { // Make a fake blob to put in the word. TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box(); C_BLOB* blob = C_BLOB::FakeBlob(box); C_BLOB_LIST blobs; C_BLOB_IT blob_it(&blobs); blob_it.add_after_then_move(blob); WERD* word = new WERD(&blobs, 0, nullptr); w_it.add_after_then_move(word); } // Each word must contain a fake blob. for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); // Just assert that this is true, as it would be useful to find // out why it isn't. ASSERT_HOST(!word->cblob_list()->empty()); } row->recalc_bounding_box(); } }
// Reorganize the blob lists with a different definition of small, medium // and large, compared to the original definition. // Height is still the primary filter key, but medium width blobs of small // height become small, and very wide blobs of small height stay noise, along // with small dot-shaped blobs. void TO_BLOCK::ReSetAndReFilterBlobs() { int min_height = IntCastRounded(kMinMediumSizeRatio * line_size); int max_height = IntCastRounded(kMaxMediumSizeRatio * line_size); BLOBNBOX_LIST noise_list; BLOBNBOX_LIST small_list; BLOBNBOX_LIST medium_list; BLOBNBOX_LIST large_list; SizeFilterBlobs(min_height, max_height, &blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &large_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &small_blobs, &noise_list, &small_list, &medium_list, &large_list); SizeFilterBlobs(min_height, max_height, &noise_blobs, &noise_list, &small_list, &medium_list, &large_list); BLOBNBOX_IT blob_it(&blobs); blob_it.add_list_after(&medium_list); blob_it.set_to_list(&large_blobs); blob_it.add_list_after(&large_list); blob_it.set_to_list(&small_blobs); blob_it.add_list_after(&small_list); blob_it.set_to_list(&noise_blobs); blob_it.add_list_after(&noise_list); }
// Helper to call CleanNeighbours on all blobs on the list. void BLOBNBOX::CleanNeighbours(BLOBNBOX_LIST* blobs) { BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { blob_it.data()->CleanNeighbours(); } }
// Fits a straight baseline to the points. Returns true if it had enough // points to be reasonably sure of the fitted baseline. // If use_box_bottoms is false, baselines positions are formed by // considering the outlines of the blobs. bool BaselineRow::FitBaseline(bool use_box_bottoms) { // Deterministic fitting is used wherever possible. fitter_.Clear(); // Linear least squares is a backup if the DetLineFit produces a bad line. LLSQ llsq; BLOBNBOX_IT blob_it(blobs_); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); if (!use_box_bottoms) blob->EstimateBaselinePosition(); const TBOX& box = blob->bounding_box(); int x_middle = (box.left() + box.right()) / 2; #ifdef kDebugYCoord if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) { tprintf("Box bottom = %d, baseline pos=%d for box at:", box.bottom(), blob->baseline_position()); box.print(); } #endif fitter_.Add(ICOORD(x_middle, blob->baseline_position()), box.width() / 2); llsq.add(x_middle, blob->baseline_position()); } // Fit the line. ICOORD pt1, pt2; baseline_error_ = fitter_.Fit(&pt1, &pt2); baseline_pt1_ = pt1; baseline_pt2_ = pt2; if (baseline_error_ > max_baseline_error_ && fitter_.SufficientPointsForIndependentFit()) { // The fit was bad but there were plenty of points, so try skipping // the first and last few, and use the new line if it dramatically improves // the error of fit. double error = fitter_.Fit(kNumSkipPoints, kNumSkipPoints, &pt1, &pt2); if (error < baseline_error_ / 2.0) { baseline_error_ = error; baseline_pt1_ = pt1; baseline_pt2_ = pt2; } } int debug = 0; #ifdef kDebugYCoord Print(); debug = bounding_box_.bottom() < kDebugYCoord && bounding_box_.top() > kDebugYCoord ? 3 : 2; #endif // Now we obtained a direction from that fit, see if we can improve the // fit using the same direction and some other start point. 
FCOORD direction(pt2 - pt1); double target_offset = direction * pt1; good_baseline_ = false; FitConstrainedIfBetter(debug, direction, 0.0, target_offset); // Wild lines can be produced because DetLineFit allows vertical lines, but // vertical text has been rotated so angles over pi/4 should be disallowed. // Near vertical lines can still be produced by vertically aligned components // on very short lines. double angle = BaselineAngle(); if (fabs(angle) > M_PI * 0.25) { // Use the llsq fit as a backup. baseline_pt1_ = llsq.mean_point(); baseline_pt2_ = baseline_pt1_ + FCOORD(1.0f, llsq.m()); // TODO(rays) get rid of this when m and c are no longer used. double m = llsq.m(); double c = llsq.c(m); baseline_error_ = llsq.rms(m, c); good_baseline_ = false; } return good_baseline_; }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. 
if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }
// Places a copy of blobs that are near a word (after applying rotation to the // blob) in the most appropriate word, unless there is doubt, in which case a // blob can end up in two words. Source blobs are not touched. void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, const FCOORD& rotation, WordGrid* word_grid) { WordSearch ws(word_grid); BLOBNBOX_IT b_it(diacritic_blobs); // Apply rotation to each blob before finding the nearest words. The rotation // allows us to only consider above/below placement and not left/right on // vertical text, because all text is horizontal here. for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOBNBOX* blobnbox = b_it.data(); TBOX blob_box = blobnbox->bounding_box(); blob_box.rotate(rotation); ws.StartRectSearch(blob_box); // Above/below refer to word position relative to diacritic. Since some // scripts eg Kannada/Telugu habitually put diacritics below words, and // others eg Thai/Vietnamese/Latin put most diacritics above words, try // for both if there isn't much in it. WordWithBox* best_above_word = nullptr; WordWithBox* best_below_word = nullptr; int best_above_distance = 0; int best_below_distance = 0; for (WordWithBox* word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { if (word->word()->flag(W_REP_CHAR)) continue; TBOX word_box = word->true_bounding_box(); int x_distance = blob_box.x_gap(word_box); int y_distance = blob_box.y_gap(word_box); if (x_distance > 0) { // Arbitrarily divide x-distance by 2 if there is a major y overlap, // and the word is to the left of the diacritic. If the // diacritic is a dropped broken character between two words, this will // help send all the pieces to a single word, instead of splitting them // over the 2 words. 
if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { x_distance /= 2; } y_distance += x_distance; } if (word_box.y_middle() > blob_box.y_middle() && (best_above_word == nullptr || y_distance < best_above_distance)) { best_above_word = word; best_above_distance = y_distance; } if (word_box.y_middle() <= blob_box.y_middle() && (best_below_word == nullptr || y_distance < best_below_distance)) { best_below_word = word; best_below_distance = y_distance; } } bool above_good = best_above_word != nullptr && (best_below_word == nullptr || best_above_distance < best_below_distance + blob_box.height()); bool below_good = best_below_word != nullptr && best_below_word != best_above_word && (best_above_word == nullptr || best_below_distance < best_above_distance + blob_box.height()); if (below_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_below_word->RejBlobs()); blob_it.add_to_end(copied_blob); } if (above_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_above_word->RejBlobs()); blob_it.add_to_end(copied_blob); } } }
void collect_ems_for_adaption(WERD_RES *word, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 i; CHAR_SAMPLE *sample; PIXROW_LIST *pixrow_list; PIXROW_IT pixrow_it; IMAGELINE *imlines; // lines of the image TBOX pix_box; // box of imlines // extent WERD copy_outword; // copy to denorm PBLOB_IT copy_blob_it; OUTLINE_IT copy_outline_it; inT32 resolution = page_image.get_res (); if (tessedit_reject_ems || tessedit_reject_suspect_ems) return; // Do nothing if (word->word->bounding_box ().height () > resolution / 3) return; if (tessedit_demo_adaption) // Make sure not set tessedit_display_mm.set_value (FALSE); if (word_adaptable (word, tessedit_em_adaption_mode) && word->reject_map.reject_count () == 0 && (strchr (word->best_choice->string ().string (), 'm') != NULL || (tessedit_process_rns && strstr (word->best_choice->string ().string (), "rn") != NULL))) { if (tessedit_process_rns && strstr (word->best_choice->string ().string (), "rn") != NULL) { copy_outword = *(word->outword); copy_blob_it.set_to_list (copy_outword.blob_list ()); i = 0; while (word->best_choice->string ()[i] != '\0') { if (word->best_choice->string ()[i] == 'r' && word->best_choice->string ()[i + 1] == 'n') { copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ()); copy_outline_it.add_list_after (copy_blob_it. 
data_relative (1)-> out_list ()); copy_blob_it.forward (); delete (copy_blob_it.extract ()); i++; } copy_blob_it.forward (); i++; } } else copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); pixrow_it.set_to_list (pixrow_list); pixrow_it.move_to_first (); blob_it.move_to_first (); for (i = 0; word->best_choice->string ()[i] != '\0'; i++, pixrow_it.forward (), blob_it.forward ()) { if (word->best_choice->string ()[i] == 'm' || (word->best_choice->string ()[i] == 'r' && word->best_choice->string ()[i + 1] == 'n')) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample %c for adaption found in %s, index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), i); #endif if (tessedit_matrix_match) { sample = clip_sample (pixrow_it.data (), imlines, pix_box, copy_outword.flag (W_INVERSE), word->best_choice->string ()[i]); if (sample == NULL) { //Clip failed #ifndef SECURE_NAMES tprintf ("Unable to clip sample from %s, index %d\n", word->best_choice->string ().string (), i); #endif if (word->best_choice->string ()[i] == 'r') i++; continue; } } else sample = new CHAR_SAMPLE (blob_it.data (), &word->denorm, word->best_choice->string ()[i]); cluster_sample(sample, char_clusters, chars_waiting); if (word->best_choice->string ()[i] == 'r') i++; // Skip next character } } delete[]imlines; // Free array of imlines delete pixrow_list; } }
// Creates and returns a Pix with the same resolution as the original // in which 1 (black) pixels represent likely non text (photo, line drawing) // areas of the page, deleting from the blob_block the blobs that were // determined to be non-text. // The photo_map is used to bias the decision towards non-text, rather than // supplying definite decision. // The blob_block is the usual result of connected component analysis, // holding the detected blobs. // The returned Pix should be PixDestroyed after use. Pix* CCNonTextDetect::ComputeNonTextMask(bool debug, Pix* photo_map, TO_BLOCK* blob_block) { // Insert the smallest blobs into the grid. InsertBlobList(&blob_block->small_blobs); InsertBlobList(&blob_block->noise_blobs); // Add the medium blobs that don't have a good strokewidth neighbour. // Those that do go into good_grid as an antidote to spreading beyond the // real reaches of a noise region. BlobGrid good_grid(gridsize(), bleft(), tright()); BLOBNBOX_IT blob_it(&blob_block->blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0; perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area(); if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio) InsertBBox(true, true, blob); else good_grid.InsertBBox(true, true, blob); } noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid); good_grid.Clear(); // Not needed any more. Pix* pix = noise_density_->ThresholdToPix(max_noise_count_); if (debug) { pixWrite("junknoisemask.png", pix, IFF_PNG); } ScrollView* win = NULL; #ifndef GRAPHICS_DISABLED if (debug) { win = MakeWindow(0, 400, "Photo Mask Blobs"); } #endif // GRAPHICS_DISABLED // Large and medium blobs are not text if they overlap with "a lot" of small // blobs. 
MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithSmall, win, ScrollView::DARK_GREEN, pix); MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall, win, ScrollView::WHITE, pix); // Clear the grid of small blobs and insert the medium blobs. Clear(); InsertBlobList(&blob_block->blobs); MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithMedium, win, ScrollView::DARK_GREEN, pix); // Clear again before we start deleting the blobs in the grid. Clear(); MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1, win, ScrollView::CORAL, pix); MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1, win, ScrollView::GOLDENROD, pix); MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1, win, ScrollView::WHITE, pix); if (debug) { #ifndef GRAPHICS_DISABLED win->Update(); #endif // GRAPHICS_DISABLED pixWrite("junkccphotomask.png", pix, IFF_PNG); #ifndef GRAPHICS_DISABLED delete win->AwaitEvent(SVET_DESTROY); delete win; #endif // GRAPHICS_DISABLED } return pix; }
void collect_characters_for_adaption(WERD_RES *word, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 i; CHAR_SAMPLE *sample; PIXROW_LIST *pixrow_list; PIXROW_IT pixrow_it; IMAGELINE *imlines; // lines of the image TBOX pix_box; // box of imlines // extent WERD copy_outword; // copy to denorm inT32 resolution = page_image.get_res (); if (word->word->bounding_box ().height () > resolution / 3) return; if (tessedit_demo_adaption) // Make sure not set tessedit_display_mm.set_value (FALSE); if ((word_adaptable (word, tessedit_cluster_adaption_mode) && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) { if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) return; // Reject map set to acceptable /* Collect information about good matches */ copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); pixrow_it.set_to_list (pixrow_list); pixrow_it.move_to_first (); blob_it.move_to_first (); for (i = 0; word->best_choice->string ()[i] != '\0'; i++, pixrow_it.forward (), blob_it.forward ()) { if (!(tessedit_mm_use_non_adaption_set && STRING (tessedit_non_adaption_set).contains (word-> best_choice-> string ()[i])) || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample %c for adaption found in %s, index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), i); #endif sample = clip_sample (pixrow_it.data (), imlines, pix_box, copy_outword.flag (W_INVERSE), word->best_choice->string ()[i]); if (sample == NULL) { //Clip failed #ifndef SECURE_NAMES tprintf ("Unable to clip sample from %s, index %d\n", word->best_choice->string ().string (), i); #endif continue; } cluster_sample(sample, char_clusters, chars_waiting); } } delete[]imlines; // Free array of imlines delete 
pixrow_list; } else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) // Set word to all rejects word->reject_map.rej_word_tess_failure (); }
void adapt_to_good_samples(WERD_RES *word, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); inT16 i; CHAR_SAMPLE *sample; CHAR_SAMPLES_IT c_it = char_clusters; CHAR_SAMPLE_IT cw_it = chars_waiting; float score; float best_score; char best_char; CHAR_SAMPLES *best_cluster; PIXROW_LIST *pixrow_list; PIXROW_IT pixrow_it; IMAGELINE *imlines; // lines of the image TBOX pix_box; // box of imlines // extent WERD copy_outword; // copy to denorm TBOX b_box; PBLOB_IT copy_blob_it; PIXROW *pixrow = NULL; static inT32 word_number = 0; #ifndef GRAPHICS_DISABLED ScrollView* demo_win = NULL; #endif inT32 resolution = page_image.get_res (); word_number++; if (tessedit_test_cluster_input) return; if (word->word->bounding_box ().height () > resolution / 3) return; if (char_clusters->length () == 0) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("No clusters to use for adaption\n"); #endif return; } if (!cw_it.empty ()) { complete_clustering(char_clusters, chars_waiting); print_em_stats(char_clusters, chars_waiting); } if ((!word_adaptable (word, tessedit_cluster_adaption_mode) && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) { if (tessedit_cluster_debug) { tprintf ("\nChecking: \"%s\" MAP ", word->best_choice->string ().string ()); word->reject_map.print (debug_fp); tprintf ("\n"); } copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); copy_blob_it.set_to_list (copy_outword.blob_list ()); char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); pixrow_it.set_to_list (pixrow_list); pixrow_it.move_to_first (); // For debugging only b_box = copy_outword.bounding_box (); pixrow = pixrow_it.data (); blob_it.move_to_first (); copy_blob_it.move_to_first (); for (i = 0; word->best_choice->string ()[i] != '\0'; i++, pixrow_it.forward (), blob_it.forward (), copy_blob_it.forward ()) { if (word->reject_map[i].recoverable () 
|| (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) { TBOX copy_box = copy_blob_it.data ()->bounding_box (); if (tessedit_cluster_debug) tprintf ("Sample %c to check found in %s, index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), i); if (tessedit_demo_adaption) tprintf ("Sample %c to check found in %s (%d), index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), word_number, i); sample = clip_sample (pixrow_it.data (), imlines, pix_box, copy_outword.flag (W_INVERSE), word->best_choice->string ()[i]); if (sample == NULL) { //Clip failed tprintf ("Unable to clip sample from %s, index %d\n", word->best_choice->string ().string (), i); #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (no sample)\n"); #endif word->reject_map[i].setrej_mm_reject (); continue; } best_score = MAX_INT32; best_char = '\0'; best_cluster = NULL; for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { if (c_it.data ()->character () != '\0') { score = c_it.data ()->match_score (sample); if (score < best_score) { best_cluster = c_it.data (); best_score = score; best_char = c_it.data ()->character (); } } } if (best_score > tessedit_cluster_t1) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample rejected (score %f)\n", best_score); #endif word->reject_map[i].setrej_mm_reject (); } else { if (word->best_choice->string ()[i] == best_char) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample accepted (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample accepted (score %f)\n", best_score); #endif if (tessedit_test_adaption) word->reject_map[i].setrej_minimal_rej_accept (); else word->reject_map[i].setrej_mm_accept (); } else { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); if 
(tessedit_demo_adaption) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); #endif word->reject_map[i].setrej_mm_reject (); } } if (tessedit_demo_adaption) { if (strcmp (imagebasename.string (), tessedit_demo_file.string ()) != 0 || word_number == tessedit_demo_word1 || word_number == tessedit_demo_word2) { #ifndef GRAPHICS_DISABLED demo_win = display_clip_image(©_outword, page_image, pixrow_list, pix_box); #endif demo_word = word_number; best_cluster->match_score (sample); demo_word = 0; } } } } delete[]imlines; // Free array of imlines delete pixrow_list; if (tessedit_cluster_debug) { tprintf ("\nFinal: \"%s\" MAP ", word->best_choice->string ().string ()); word->reject_map.print (debug_fp); tprintf ("\n"); } } }