float Textord::filter_noise_blobs( BLOBNBOX_LIST *src_list, // original list BLOBNBOX_LIST *noise_list, // noise list BLOBNBOX_LIST *small_list, // small blobs BLOBNBOX_LIST *large_list) { // large blobs int16_t height; //height of blob int16_t width; //of blob BLOBNBOX *blob; //current blob float initial_x; //first guess BLOBNBOX_IT src_it = src_list; //iterators BLOBNBOX_IT noise_it = noise_list; BLOBNBOX_IT small_it = small_list; BLOBNBOX_IT large_it = large_list; STATS size_stats (0, MAX_NEAREST_DIST); //blob heights float min_y; //size limits float max_y; float max_x; float max_height; //of good blobs for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { blob = src_it.data(); if (blob->bounding_box().height() < textord_max_noise_size) noise_it.add_after_then_move(src_it.extract()); else if (blob->enclosed_area() >= blob->bounding_box().height() * blob->bounding_box().width() * textord_noise_area_ratio) small_it.add_after_then_move(src_it.extract()); } for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { size_stats.add(src_it.data()->bounding_box().height(), 1); } initial_x = size_stats.ile(textord_initialx_ile); max_y = ceil(initial_x * (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + 2 * tesseract::CCStruct::kAscenderFraction) / tesseract::CCStruct::kXHeightFraction); min_y = floor (initial_x / 2); max_x = ceil (initial_x * textord_width_limit); small_it.move_to_first (); for (small_it.mark_cycle_pt (); !small_it.cycled_list (); small_it.forward ()) { height = small_it.data()->bounding_box().height(); if (height > max_y) large_it.add_after_then_move(small_it.extract ()); else if (height >= min_y) src_it.add_after_then_move(small_it.extract ()); } size_stats.clear (); for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { height = src_it.data ()->bounding_box ().height (); width = src_it.data ()->bounding_box ().width (); if (height < min_y) small_it.add_after_then_move (src_it.extract ()); else if (height > max_y || width > max_x) large_it.add_after_then_move (src_it.extract ()); else size_stats.add (height, 1); } max_height = size_stats.ile (textord_initialasc_ile); // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", // max_y,min_y,initial_x,max_height); max_height *= tesseract::CCStruct::kXHeightCapRatio; if (max_height > initial_x) initial_x = max_height; // tprintf(" ret=%g\n",initial_x); return initial_x; }
// Creates and returns a Pix with the same resolution as the original // in which 1 (black) pixels represent likely non text (photo, line drawing) // areas of the page, deleting from the blob_block the blobs that were // determined to be non-text. // The photo_map is used to bias the decision towards non-text, rather than // supplying definite decision. // The blob_block is the usual result of connected component analysis, // holding the detected blobs. // The returned Pix should be PixDestroyed after use. Pix* CCNonTextDetect::ComputeNonTextMask(bool debug, Pix* photo_map, TO_BLOCK* blob_block) { // Insert the smallest blobs into the grid. InsertBlobList(&blob_block->small_blobs); InsertBlobList(&blob_block->noise_blobs); // Add the medium blobs that don't have a good strokewidth neighbour. // Those that do go into good_grid as an antidote to spreading beyond the // real reaches of a noise region. BlobGrid good_grid(gridsize(), bleft(), tright()); BLOBNBOX_IT blob_it(&blob_block->blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0; perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area(); if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio) InsertBBox(true, true, blob); else good_grid.InsertBBox(true, true, blob); } noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid); good_grid.Clear(); // Not needed any more. Pix* pix = noise_density_->ThresholdToPix(max_noise_count_); if (debug) { pixWrite("junknoisemask.png", pix, IFF_PNG); } ScrollView* win = NULL; #ifndef GRAPHICS_DISABLED if (debug) { win = MakeWindow(0, 400, "Photo Mask Blobs"); } #endif // GRAPHICS_DISABLED // Large and medium blobs are not text if they overlap with "a lot" of small // blobs. MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithSmall, win, ScrollView::DARK_GREEN, pix); MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall, win, ScrollView::WHITE, pix); // Clear the grid of small blobs and insert the medium blobs. Clear(); InsertBlobList(&blob_block->blobs); MarkAndDeleteNonTextBlobs(&blob_block->large_blobs, kMaxLargeOverlapsWithMedium, win, ScrollView::DARK_GREEN, pix); // Clear again before we start deleting the blobs in the grid. Clear(); MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1, win, ScrollView::CORAL, pix); MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1, win, ScrollView::GOLDENROD, pix); MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1, win, ScrollView::WHITE, pix); if (debug) { #ifndef GRAPHICS_DISABLED win->Update(); #endif // GRAPHICS_DISABLED pixWrite("junkccphotomask.png", pix, IFF_PNG); #ifndef GRAPHICS_DISABLED delete win->AwaitEvent(SVET_DESTROY); delete win; #endif // GRAPHICS_DISABLED } return pix; }