Esempio n. 1
0
// Helper to compute edge offsets for  all the blobs on the list.
// See coutln.h for an explanation of edge offsets.
void BLOBNBOX::ComputeEdgeOffsets(Pix* thresholds, Pix* grey,
                                  BLOBNBOX_LIST* blobs) {
  int grey_height = 0;
  int thr_height = 0;
  int scale_factor = 1;
  if (thresholds != NULL && grey != NULL) {
    grey_height = pixGetHeight(grey);
    thr_height = pixGetHeight(thresholds);
    scale_factor =
        IntCastRounded(static_cast<double>(grey_height) / thr_height);
  }
  BLOBNBOX_IT blob_it(blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (blob->cblob() != NULL) {
      // Get the threshold that applies to this blob.
      l_uint32 threshold = 128;
      if (thresholds != NULL && grey != NULL) {
        const TBOX& box = blob->cblob()->bounding_box();
        // Transform the coordinates if required.
        TPOINT pt((box.left() + box.right()) / 2,
                  (box.top() + box.bottom()) / 2);
        pixGetPixel(thresholds, pt.x / scale_factor,
                    thr_height - 1 - pt.y / scale_factor, &threshold);
      }
      blob->cblob()->ComputeEdgeOffsets(threshold, grey);
    }
  }
}
Esempio n. 2
0
// Reorganize the blob lists with a different definition of small, medium
// and large, compared to the original definition.
// Height is still the primary filter key, but medium width blobs of small
// height become small, and very wide blobs of small height stay noise, along
// with small dot-shaped blobs.
void StrokeWidth::ReFilterBlobs(TO_BLOCK* block) {
  int min_height =
    static_cast<int>(textord_strokewidth_minsize * block->line_size + 0.5);
  int max_height =
    static_cast<int>(textord_strokewidth_maxsize * block->line_size + 0.5);
  BLOBNBOX_LIST noise_list;
  BLOBNBOX_LIST small_list;
  BLOBNBOX_LIST medium_list;
  BLOBNBOX_LIST large_list;
  SizeFilterBlobs(min_height, max_height, &block->blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &block->large_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &block->small_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &block->noise_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  BLOBNBOX_IT blob_it(&block->blobs);
  blob_it.add_list_after(&medium_list);
  blob_it.set_to_list(&block->large_blobs);
  blob_it.add_list_after(&large_list);
  blob_it.set_to_list(&block->small_blobs);
  blob_it.add_list_after(&small_list);
  blob_it.set_to_list(&block->noise_blobs);
  blob_it.add_list_after(&noise_list);
}
Esempio n. 3
0
// Converts the Boxa array to a list of C_BLOB, getting rid of severely
// overlapping outlines and those that are children of a bigger one.
// The output is a list of C_BLOBs that are owned by the list.
// The C_OUTLINEs in the C_BLOBs contain no outline data - just empty
// bounding boxes. The Boxa is consumed and destroyed.
void LineFinder::ConvertBoxaToBlobs(int image_width, int image_height,
                                    Boxa** boxes, C_BLOB_LIST* blobs) {
#ifdef HAVE_LIBLEPT
  C_OUTLINE_LIST outlines;
  C_OUTLINE_IT ol_it = &outlines;
  // Iterate the boxes to convert to outlines.
  int nboxes = boxaGetCount(*boxes);
  for (int i = 0; i < nboxes; ++i) {
    l_int32 x, y, width, height;
    boxaGetBoxGeometry(*boxes, i, &x, &y, &width, &height);
    // Make a C_OUTLINE from the leptonica box. This is a bit of a hack,
    // as there is no outline, just a bounding box, but with some very
    // small changes to coutln.cpp, it works nicely.
    ICOORD top_left(x, image_height - y);
    ICOORD bot_right(x + width, image_height - (y + height));
    CRACKEDGE startpt;
    startpt.pos = top_left;
    C_OUTLINE* outline = new C_OUTLINE(&startpt, top_left, bot_right, 0);
    ol_it.add_after_then_move(outline);
  }
  // Use outlines_to_blobs to convert the outlines to blobs and find
  // overlapping and contained objects. The output list of blobs in the block
  // has all the bad ones filtered out and deleted.
  BLOCK block;
  ICOORD page_tl(0, 0);
  ICOORD page_br(image_width, image_height);
  outlines_to_blobs(&block, page_tl, page_br, &outlines);
  // Transfer the created blobs to the output list.
  C_BLOB_IT blob_it(blobs);
  blob_it.add_list_after(block.blob_list());
  // The boxes aren't needed any more.
  boxaDestroy(boxes);
#endif
}
Esempio n. 4
0
// Remove outlines that are a tiny fraction in either width or height
// of the word height.
void Textord::clean_small_noise_from_words(ROW *row) {
  WERD_IT word_it(row->word_list());
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    WERD* word = word_it.data();
    int min_size = static_cast<int>(
      textord_noise_hfract * word->bounding_box().height() + 0.5);
    C_BLOB_IT blob_it(word->cblob_list());
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
      C_BLOB* blob = blob_it.data();
      C_OUTLINE_IT out_it(blob->out_list());
      for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
        C_OUTLINE* outline = out_it.data();
        outline->RemoveSmallRecursive(min_size, &out_it);
      }
      if (blob->out_list()->empty()) {
        delete blob_it.extract();
      }
    }
    if (word->cblob_list()->empty()) {
      if (!word_it.at_last()) {
        // The next word is no longer a fuzzy non space if it was before,
        // since the word before is about to be deleted.
        WERD* next_word = word_it.data_relative(1);
        if (next_word->flag(W_FUZZY_NON)) {
          next_word->set_flag(W_FUZZY_NON, false);
        }
      }
      delete word_it.extract();
    }
  }
}
Esempio n. 5
0
PAGE_RES_IT* make_pseudo_word(PAGE_RES* page_res, const TBOX& selection_box) {
  PAGE_RES_IT pr_it(page_res);
  C_BLOB_LIST new_blobs;               // list of gathered blobs
  C_BLOB_IT new_blob_it = &new_blobs;  // iterator

  for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
       word_res = pr_it.forward()) {
    WERD* word = word_res->word;
    if (word->bounding_box().overlap(selection_box)) {
      C_BLOB_IT blob_it(word->cblob_list());
      for (blob_it.mark_cycle_pt();
           !blob_it.cycled_list(); blob_it.forward()) {
        C_BLOB* blob = blob_it.data();
        if (blob->bounding_box().overlap(selection_box)) {
          new_blob_it.add_after_then_move(C_BLOB::deep_copy(blob));
        }
      }
      if (!new_blobs.empty()) {
        WERD* pseudo_word = new WERD(&new_blobs, 1, NULL);
        word_res = pr_it.InsertSimpleCloneWord(*word_res, pseudo_word);
        PAGE_RES_IT* it = new PAGE_RES_IT(page_res);
        while (it->word() != word_res && it->word() != NULL) it->forward();
        ASSERT_HOST(it->word() == word_res);
        return it;
      }
    }
  }
  return NULL;
}
Esempio n. 6
0
// Computes the noise_density_ by summing the number of elements in a
// neighbourhood of each grid cell.
void StrokeWidth::ComputeNoiseDensity(TO_BLOCK* block, TabFind* line_grid) {
  // Run a preliminary strokewidth neighbour detection on the medium blobs.
  line_grid->InsertBlobList(true, true, false, &block->blobs, false, this);
  BLOBNBOX_IT blob_it(&block->blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    SetNeighbours(false, blob_it.data());
  }
  // Remove blobs with a good strokewidth neighbour from the grid.
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (blob->GoodTextBlob() > 0)
      RemoveBBox(blob);
    blob->ClearNeighbours();
  }
  // Insert the smaller blobs into the grid.
  line_grid->InsertBlobList(true, true, false, &block->small_blobs,
                            false, this);
  line_grid->InsertBlobList(true, true, false, &block->noise_blobs,
                            false, this);
  if (noise_density_ != NULL)
    delete noise_density_;
  IntGrid* cell_counts = CountCellElements();
  noise_density_ = cell_counts->NeighbourhoodSum();
  delete cell_counts;
  // Clear the grid as we don't want the small stuff hanging around in it.
  Clear();
}
// Inserts a list of blobs into the projection.
// Rotation is a multiple of 90 degrees to get from blob coords to
// nontext_map coords, nontext_map_box is the bounds of the nontext_map.
// Blobs are spread horizontally or vertically according to their internal
// flags, but the spreading is truncated by set pixels in the nontext_map
// and also by the horizontal rule line limits on the blobs.
void TextlineProjection::ProjectBlobs(BLOBNBOX_LIST* blobs,
                                      const FCOORD& rotation,
                                      const TBOX& nontext_map_box,
                                      Pix* nontext_map) {
  BLOBNBOX_IT blob_it(blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    TBOX bbox = blob->bounding_box();
    ICOORD middle((bbox.left() + bbox.right()) / 2,
                  (bbox.bottom() + bbox.top()) / 2);
    bool spreading_horizontally = PadBlobBox(blob, &bbox);
    // Rotate to match the nontext_map.
    bbox.rotate(rotation);
    middle.rotate(rotation);
    if (rotation.x() == 0.0f)
      spreading_horizontally = !spreading_horizontally;
    // Clip to the image before applying the increments.
    bbox &= nontext_map_box;  // This is in-place box intersection.
    // Check for image pixels before spreading.
    TruncateBoxToMissNonText(middle.x(), middle.y(), spreading_horizontally,
                             nontext_map, &bbox);
    if (bbox.area() > 0) {
      IncrementRectangle8Bit(bbox);
    }
  }
}
Esempio n. 8
0
// Inserts all the blobs from the given list, with x and y spreading,
// without removing from the source list, so ownership remains with the
// source list.
    void BlobGrid::InsertBlobList(BLOBNBOX_LIST * blobs) {
        BLOBNBOX_IT blob_it(blobs);
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
            BLOBNBOX *blob = blob_it.data();
            if (!blob->joined_to_prev())
                InsertBBox(true, true, blob);
        }
    }
BOOL8 suspect_em(WERD_RES *word, inT16 index) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 j;

  for (j = 0; j < index; j++)
    blob_it.forward ();

  return (blob_it.data ()->out_list ()->length () != 1);
}
Esempio n. 10
0
// Helper to delete all the deletable blobs on the list.
void BLOBNBOX::DeleteNoiseBlobs(BLOBNBOX_LIST* blobs) {
  BLOBNBOX_IT blob_it(blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (blob->DeletableNoise()) {
      delete blob->cblob();
      delete blob_it.extract();
    }
  }
}
Esempio n. 11
0
// Finds horizontal line objects in the given pix.
// Uses the given resolution to determine size thresholds instead of any
// that may be present in the pix.
// The output vectors are owned by the list and Frozen (cannot refit) by
// having no boxes, as there is no need to refit or merge separator lines.
void LineFinder::FindHorizontalLines(int resolution,  Pix* pix,
                                     TabVector_LIST* vectors) {
#ifdef HAVE_LIBLEPT
  Pix* line_pix;
  Boxa* boxes = GetHLineBoxes(resolution, pix, &line_pix);
  C_BLOB_LIST line_cblobs;
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  ConvertBoxaToBlobs(height, width, &boxes, &line_cblobs);
  // Make the BLOBNBOXes from the C_BLOBs.
  BLOBNBOX_LIST line_bblobs;
  C_BLOB_IT blob_it(&line_cblobs);
  BLOBNBOX_IT bbox_it(&line_bblobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* cblob = blob_it.data();
    BLOBNBOX* bblob = new BLOBNBOX(cblob);
    bbox_it.add_to_end(bblob);
  }
  ICOORD bleft(0, 0);
  ICOORD tright(height, width);
  int vertical_x, vertical_y;
  FindLineVectors(bleft, tright, &line_bblobs, &vertical_x, &vertical_y,
                  vectors);
  if (!vectors->empty()) {
    // Some lines were found, so erase the unused blobs from the line image
    // and then subtract the line image from the source.
    bbox_it.move_to_first();
    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
      BLOBNBOX* blob = bbox_it.data();
      if (blob->left_tab_type() == TT_UNCONFIRMED) {
        const TBOX& box = blob->bounding_box();
        // Coords are in tess format so filp x and y and then covert
        // to leptonica by height -y.
        Box* pixbox = boxCreate(box.bottom(), height - box.right(),
                                box.height(), box.width());
        pixClearInRect(line_pix, pixbox);
        boxDestroy(&pixbox);
      }
    }
    pixDilateBrick(line_pix, line_pix, 3, 1);
    pixSubtract(pix, pix, line_pix);
    if (textord_tabfind_show_vlines)
      pixWrite("hlinesclean.png", line_pix, IFF_PNG);
    ICOORD vertical;
    vertical.set_with_shrink(vertical_x, vertical_y);
    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
    // Iterate the vectors to flip them.
    TabVector_IT h_it(vectors);
    for (h_it.mark_cycle_pt(); !h_it.cycled_list(); h_it.forward()) {
      h_it.data()->XYFlip();
    }
  }
  pixDestroy(&line_pix);
#endif
}
Esempio n. 12
0
// Tests each blob in the list to see if it is certain non-text using 2
// conditions:
// 1. blob overlaps a cell with high value in noise_density_ (previously set
// by ComputeNoiseDensity).
// OR 2. The blob overlaps more than max_blob_overlaps in *this grid. This
// condition is disabled with max_blob_overlaps == -1.
// If it does, the blob is declared non-text, and is used to mark up the
// nontext_mask. Such blobs are fully deleted, and non-noise blobs have their
// neighbours reset, as they may now point to deleted data.
// WARNING: The blobs list blobs may be in the *this grid, but they are
// not removed. If any deleted blobs might be in *this, then this must be
// Clear()ed immediately after MarkAndDeleteNonTextBlobs is called.
// If the win is not NULL, deleted blobs are drawn on it in red, and kept
// blobs are drawn on it in ok_color.
void CCNonTextDetect::MarkAndDeleteNonTextBlobs(BLOBNBOX_LIST* blobs,
        int max_blob_overlaps,
        ScrollView* win,
        ScrollView::Color ok_color,
        Pix* nontext_mask) {
    int imageheight = tright().y() - bleft().x();
    BLOBNBOX_IT blob_it(blobs);
    BLOBNBOX_LIST dead_blobs;
    BLOBNBOX_IT dead_it(&dead_blobs);
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
        BLOBNBOX* blob = blob_it.data();
        TBOX box = blob->bounding_box();
        if (!noise_density_->RectMostlyOverThreshold(box, max_noise_count_) &&
                (max_blob_overlaps < 0 ||
                 !BlobOverlapsTooMuch(blob, max_blob_overlaps))) {
            blob->ClearNeighbours();
#ifndef GRAPHICS_DISABLED
            if (win != NULL)
                blob->plot(win, ok_color, ok_color);
#endif  // GRAPHICS_DISABLED
        } else {
            if (noise_density_->AnyZeroInRect(box)) {
                // There is a danger that the bounding box may overlap real text, so
                // we need to render the outline.
                Pix* blob_pix = blob->cblob()->render_outline();
                pixRasterop(nontext_mask, box.left(), imageheight - box.top(),
                            box.width(), box.height(), PIX_SRC | PIX_DST,
                            blob_pix, 0, 0);
                pixDestroy(&blob_pix);
            } else {
                if (box.area() < gridsize() * gridsize()) {
                    // It is a really bad idea to make lots of small components in the
                    // photo mask, so try to join it to a bigger area by expanding the
                    // box in a way that does not touch any zero noise density cell.
                    box = AttemptBoxExpansion(box, *noise_density_, gridsize());
                }
                // All overlapped cells are non-zero, so just mark the rectangle.
                pixRasterop(nontext_mask, box.left(), imageheight - box.top(),
                            box.width(), box.height(), PIX_SET, NULL, 0, 0);
            }
#ifndef GRAPHICS_DISABLED
            if (win != NULL)
                blob->plot(win, ScrollView::RED, ScrollView::RED);
#endif  // GRAPHICS_DISABLED
            // It is safe to delete the cblob now, as it isn't used by the grid
            // or BlobOverlapsTooMuch, and the BLOBNBOXes will go away with the
            // dead_blobs list.
            // TODO(rays) delete the delete when the BLOBNBOX destructor deletes
            // the cblob.
            delete blob->cblob();
            dead_it.add_to_end(blob_it.extract());
        }
    }
}
Esempio n. 13
0
// Adds the selected outlines to the indcated real blobs, and puts the rest
// back in rej_cblobs where they came from. Where the target_blobs entry is
// nullptr, a run of wanted outlines is put into a single new blob.
// Ownership of the outlines is transferred back to the word. (Hence
// GenericVector and not PointerVector.)
// Returns true if any new blob was added to the start of the word, which
// suggests that it might need joining to the word before it, and likewise
// sets make_next_word_fuzzy true if any new blob was added to the end.
bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
                               const GenericVector<C_BLOB*>& target_blobs,
                               const GenericVector<C_OUTLINE*>& outlines,
                               bool* make_next_word_fuzzy) {
  bool outline_added_to_start = false;
  if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = false;
  C_BLOB_IT rej_it(&rej_cblobs);
  for (int i = 0; i < outlines.size(); ++i) {
    C_OUTLINE* outline = outlines[i];
    if (outline == nullptr) continue;  // Already used it.
    if (wanted[i]) {
      C_BLOB* target_blob = target_blobs[i];
      TBOX noise_box = outline->bounding_box();
      if (target_blob == nullptr) {
        target_blob = new C_BLOB(outline);
        // Need to find the insertion point.
        C_BLOB_IT blob_it(&cblobs);
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (blob_box.left() > noise_box.left()) {
            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
              // We might want to join this word to its predecessor.
              outline_added_to_start = true;
            }
            blob_it.add_before_stay_put(target_blob);
            break;
          }
        }
        if (blob_it.cycled_list()) {
          blob_it.add_to_end(target_blob);
          if (make_next_word_fuzzy != nullptr) *make_next_word_fuzzy = true;
        }
        // Add all consecutive wanted, but null-blob outlines to same blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        while (i + 1 < outlines.size() && wanted[i + 1] &&
               target_blobs[i + 1] == nullptr) {
          ++i;
          ol_it.add_to_end(outlines[i]);
        }
      } else {
        // Insert outline into this blob.
        C_OUTLINE_IT ol_it(target_blob->out_list());
        ol_it.add_to_end(outline);
      }
    } else {
      // Put back on noise list.
      rej_it.add_to_end(new C_BLOB(outline));
    }
  }
  return outline_added_to_start;
}
Esempio n. 14
0
void WERD::join_on(WERD* other) {
  C_BLOB_IT blob_it(&cblobs);
  C_BLOB_IT src_it(&other->cblobs);
  C_BLOB_IT rej_cblob_it(&rej_cblobs);
  C_BLOB_IT src_rej_it(&other->rej_cblobs);

  while (!src_it.empty()) {
    blob_it.add_to_end(src_it.extract());
    src_it.forward();
  }
  while (!src_rej_it.empty()) {
    rej_cblob_it.add_to_end(src_rej_it.extract());
    src_rej_it.forward();
  }
}
Esempio n. 15
0
// Finds vertical line objects in the given pix.
// Uses the given resolution to determine size thresholds instead of any
// that may be present in the pix.
// The output vertical_x and vertical_y contain a sum of the output vectors,
// thereby giving the mean vertical direction.
// The output vectors are owned by the list and Frozen (cannot refit) by
// having no boxes, as there is no need to refit or merge separator lines.
void LineFinder::FindVerticalLines(int resolution,  Pix* pix,
                                   int* vertical_x, int* vertical_y,
                                   TabVector_LIST* vectors) {
#ifdef HAVE_LIBLEPT
  Pix* line_pix;
  Boxa* boxes = GetVLineBoxes(resolution, pix, &line_pix);
  C_BLOB_LIST line_cblobs;
  int width = pixGetWidth(pix);
  int height = pixGetHeight(pix);
  ConvertBoxaToBlobs(width, height, &boxes, &line_cblobs);
  // Make the BLOBNBOXes from the C_BLOBs.
  BLOBNBOX_LIST line_bblobs;
  C_BLOB_IT blob_it(&line_cblobs);
  BLOBNBOX_IT bbox_it(&line_bblobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* cblob = blob_it.data();
    BLOBNBOX* bblob = new BLOBNBOX(cblob);
    bbox_it.add_to_end(bblob);
  }
  ICOORD bleft(0, 0);
  ICOORD tright(width, height);
  FindLineVectors(bleft, tright, &line_bblobs, vertical_x, vertical_y, vectors);
  if (!vectors->empty()) {
    // Some lines were found, so erase the unused blobs from the line image
    // and then subtract the line image from the source.
    bbox_it.move_to_first();
    for (bbox_it.mark_cycle_pt(); !bbox_it.cycled_list(); bbox_it.forward()) {
      BLOBNBOX* blob = bbox_it.data();
      if (blob->left_tab_type() == TT_UNCONFIRMED) {
        const TBOX& box = blob->bounding_box();
        Box* pixbox = boxCreate(box.left(), height - box.top(),
                                box.width(), box.height());
        pixClearInRect(line_pix, pixbox);
        boxDestroy(&pixbox);
      }
    }
    pixDilateBrick(line_pix, line_pix, 1, 3);
    pixSubtract(pix, pix, line_pix);
    if (textord_tabfind_show_vlines)
      pixWrite("vlinesclean.png", line_pix, IFF_PNG);
    ICOORD vertical;
    vertical.set_with_shrink(*vertical_x, *vertical_y);
    TabVector::MergeSimilarTabVectors(vertical, vectors, NULL);
  }
  pixDestroy(&line_pix);
#endif
}
Esempio n. 16
0
// Sets up displacement_modes_ with the top few modes of the perpendicular
// distance of each blob from the given direction vector, after rounding.
void BaselineRow::SetupBlobDisplacements(const FCOORD& direction) {
  // Set of perpendicular displacements of the blob bottoms from the required
  // baseline direction.
  GenericVector<double> perp_blob_dists;
  displacement_modes_.truncate(0);
  // Gather the skew-corrected position of every blob.
  double min_dist = MAX_FLOAT32;
  double max_dist = -MAX_FLOAT32;
  BLOBNBOX_IT blob_it(blobs_);
  bool debug = false;
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    const TBOX& box = blob->bounding_box();
#ifdef kDebugYCoord
    if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) debug = true;
#endif
    FCOORD blob_pos((box.left() + box.right()) / 2.0f,
                    blob->baseline_position());
    double offset = direction * blob_pos;
    perp_blob_dists.push_back(offset);
    if (debug) {
      tprintf("Displacement %g for blob at:", offset);
      box.print();
    }
    UpdateRange(offset, &min_dist, &max_dist);
  }
  // Set up a histogram using disp_quant_factor_ as the bucket size.
  STATS dist_stats(IntCastRounded(min_dist / disp_quant_factor_),
                   IntCastRounded(max_dist / disp_quant_factor_) + 1);
  for (int i = 0; i < perp_blob_dists.size(); ++i) {
    dist_stats.add(IntCastRounded(perp_blob_dists[i] / disp_quant_factor_), 1);
  }
  GenericVector<KDPairInc<float, int> > scaled_modes;
  dist_stats.top_n_modes(kMaxDisplacementsModes, &scaled_modes);
  if (debug) {
    for (int i = 0; i < scaled_modes.size(); ++i) {
      tprintf("Top mode = %g * %d\n",
              scaled_modes[i].key * disp_quant_factor_, scaled_modes[i].data);
    }
  }
  for (int i = 0; i < scaled_modes.size(); ++i)
    displacement_modes_.push_back(disp_quant_factor_ * scaled_modes[i].key);
}
Esempio n. 17
0
// Removes noise from the word by moving small outlines to the rej_cblobs
// list, based on the size_threshold.
void WERD::CleanNoise(float size_threshold) {
  C_BLOB_IT blob_it(&cblobs);
  C_BLOB_IT rej_it(&rej_cblobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB* blob = blob_it.data();
    C_OUTLINE_IT ol_it(blob->out_list());
    for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
      C_OUTLINE* outline = ol_it.data();
      TBOX ol_box = outline->bounding_box();
      int ol_size =
          ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
      if (ol_size < size_threshold) {
        // This outline is too small. Move it to a separate blob in the
        // reject blobs list.
        C_BLOB* rej_blob = new C_BLOB(ol_it.extract());
        rej_it.add_after_then_move(rej_blob);
      }
    }
    if (blob->out_list()->empty()) delete blob_it.extract();
  }
}
Esempio n. 18
0
BOOL8 suspect_fullstop(WERD_RES *word, inT16 i) {
  float aspect_ratio;
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 j;
  TBOX box;
  inT16 width;
  inT16 height;

  for (j = 0; j < i; j++)
    blob_it.forward ();

  box = blob_it.data ()->bounding_box ();

  width = box.width ();
  height = box.height ();

  aspect_ratio = ((width > height) ? ((float) width) / height :
  ((float) height) / width);

  return (aspect_ratio > tessed_fullstop_aspect_ratio);
}
Esempio n. 19
0
// Fixes the block so it obeys all the rules:
// Must have at least one ROW.
// Must have at least one WERD.
// WERDs contain a fake blob.
void Textord::cleanup_nontext_block(BLOCK* block) {
  // Non-text blocks must contain at least one row.
  ROW_IT row_it(block->row_list());
  if (row_it.empty()) {
    const TBOX& box = block->pdblk.bounding_box();
    float height = box.height();
    int32_t xstarts[2] = {box.left(), box.right()};
    double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
    ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
                       height / 4.0f, 0, 1);
    row_it.add_after_then_move(row);
  }
  // Each row must contain at least one word.
  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
    ROW* row = row_it.data();
    WERD_IT w_it(row->word_list());
    if (w_it.empty()) {
      // Make a fake blob to put in the word.
      TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box()
                                                : row->bounding_box();
      C_BLOB* blob = C_BLOB::FakeBlob(box);
      C_BLOB_LIST blobs;
      C_BLOB_IT blob_it(&blobs);
      blob_it.add_after_then_move(blob);
      WERD* word = new WERD(&blobs, 0, nullptr);
      w_it.add_after_then_move(word);
    }
    // Each word must contain a fake blob.
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD* word = w_it.data();
      // Just assert that this is true, as it would be useful to find
      // out why it isn't.
      ASSERT_HOST(!word->cblob_list()->empty());
    }
    row->recalc_bounding_box();
  }
}
Esempio n. 20
0
// Reorganize the blob lists with a different definition of small, medium
// and large, compared to the original definition.
// Height is still the primary filter key, but medium width blobs of small
// height become small, and very wide blobs of small height stay noise, along
// with small dot-shaped blobs.
void TO_BLOCK::ReSetAndReFilterBlobs() {
  int min_height = IntCastRounded(kMinMediumSizeRatio * line_size);
  int max_height = IntCastRounded(kMaxMediumSizeRatio * line_size);
  BLOBNBOX_LIST noise_list;
  BLOBNBOX_LIST small_list;
  BLOBNBOX_LIST medium_list;
  BLOBNBOX_LIST large_list;
  SizeFilterBlobs(min_height, max_height, &blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &large_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &small_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  SizeFilterBlobs(min_height, max_height, &noise_blobs,
                  &noise_list, &small_list, &medium_list, &large_list);
  BLOBNBOX_IT blob_it(&blobs);
  blob_it.add_list_after(&medium_list);
  blob_it.set_to_list(&large_blobs);
  blob_it.add_list_after(&large_list);
  blob_it.set_to_list(&small_blobs);
  blob_it.add_list_after(&small_list);
  blob_it.set_to_list(&noise_blobs);
  blob_it.add_list_after(&noise_list);
}
Esempio n. 21
0
// Helper to call CleanNeighbours on all blobs on the list.
void BLOBNBOX::CleanNeighbours(BLOBNBOX_LIST* blobs) {
  BLOBNBOX_IT blob_it(blobs);
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    blob_it.data()->CleanNeighbours();
  }
}
Esempio n. 22
0
// Fits a straight baseline to the points. Returns true if it had enough
// points to be reasonably sure of the fitted baseline.
// If use_box_bottoms is false, baselines positions are formed by
// considering the outlines of the blobs.
bool BaselineRow::FitBaseline(bool use_box_bottoms) {
  // Deterministic fitting is used wherever possible.
  fitter_.Clear();
  // Linear least squares is a backup if the DetLineFit produces a bad line.
  LLSQ llsq;
  BLOBNBOX_IT blob_it(blobs_);

  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    BLOBNBOX* blob = blob_it.data();
    if (!use_box_bottoms) blob->EstimateBaselinePosition();
    const TBOX& box = blob->bounding_box();
    int x_middle = (box.left() + box.right()) / 2;
#ifdef kDebugYCoord
    if (box.bottom() < kDebugYCoord && box.top() > kDebugYCoord) {
      tprintf("Box bottom = %d, baseline pos=%d for box at:",
              box.bottom(), blob->baseline_position());
      box.print();
    }
#endif
    fitter_.Add(ICOORD(x_middle, blob->baseline_position()), box.width() / 2);
    llsq.add(x_middle, blob->baseline_position());
  }
  // Fit the line.
  ICOORD pt1, pt2;
  baseline_error_ = fitter_.Fit(&pt1, &pt2);
  baseline_pt1_ = pt1;
  baseline_pt2_ = pt2;
  if (baseline_error_ > max_baseline_error_ &&
      fitter_.SufficientPointsForIndependentFit()) {
    // The fit was bad but there were plenty of points, so try skipping
    // the first and last few, and use the new line if it dramatically improves
    // the error of fit.
    double error = fitter_.Fit(kNumSkipPoints, kNumSkipPoints, &pt1, &pt2);
    if (error < baseline_error_ / 2.0) {
      baseline_error_ = error;
      baseline_pt1_ = pt1;
      baseline_pt2_ = pt2;
    }
  }
  int debug = 0;
#ifdef kDebugYCoord
  Print();
  debug = bounding_box_.bottom() < kDebugYCoord &&
      bounding_box_.top() > kDebugYCoord
            ? 3 : 2;
#endif
  // Now we obtained a direction from that fit, see if we can improve the
  // fit using the same direction and some other start point.
  FCOORD direction(pt2 - pt1);
  double target_offset = direction * pt1;
  good_baseline_ = false;
  FitConstrainedIfBetter(debug, direction, 0.0, target_offset);
  // Wild lines can be produced because DetLineFit allows vertical lines, but
  // vertical text has been rotated so angles over pi/4 should be disallowed.
  // Near vertical lines can still be produced by vertically aligned components
  // on very short lines.
  double angle = BaselineAngle();
  if (fabs(angle) > M_PI * 0.25) {
    // Use the llsq fit as a backup.
    baseline_pt1_ = llsq.mean_point();
    baseline_pt2_ = baseline_pt1_ + FCOORD(1.0f, llsq.m());
    // TODO(rays) get rid of this when m and c are no longer used.
    double m = llsq.m();
    double c = llsq.c(m);
    baseline_error_ = llsq.rms(m, c);
    good_baseline_ = false;
  }
  return good_baseline_;
}
Esempio n. 23
0
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                 const TBOX& box, const TBOX& next_box,
                                 const char* correct_text) {
  if (applybox_debug > 1) {
    tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
  }
  WERD* new_word = NULL;
  BLOCK_IT b_it(block_list);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOCK* block = b_it.data();
    if (!box.major_overlap(block->bounding_box()))
      continue;
    ROW_IT r_it(block->row_list());
    for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
      ROW* row = r_it.data();
      if (!box.major_overlap(row->bounding_box()))
        continue;
      WERD_IT w_it(row->word_list());
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD* word = w_it.data();
        if (applybox_debug > 2) {
          tprintf("Checking word:");
          word->bounding_box().print();
        }
        if (word->text() != NULL && word->text()[0] != '\0')
          continue;  // Ignore words that are already done.
        if (!box.major_overlap(word->bounding_box()))
          continue;
        C_BLOB_IT blob_it(word->cblob_list());
        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
             blob_it.forward()) {
          C_BLOB* blob = blob_it.data();
          TBOX blob_box = blob->bounding_box();
          if (!blob_box.major_overlap(box))
            continue;
          double current_box_miss_metric = BoxMissMetric(blob_box, box);
          double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
          if (applybox_debug > 2) {
            tprintf("Checking blob:");
            blob_box.print();
            tprintf("Current miss metric = %g, next = %g\n",
                    current_box_miss_metric, next_box_miss_metric);
          }
          if (current_box_miss_metric > next_box_miss_metric)
            continue;  // Blob is a better match for next box.
          if (applybox_debug > 2) {
            tprintf("Blob match: blob:");
            blob_box.print();
            tprintf("Matches box:");
            box.print();
            tprintf("With next box:");
            next_box.print();
          }
          if (new_word == NULL) {
            // Make a new word with a single blob.
            new_word = word->shallow_copy();
            new_word->set_text(correct_text);
            w_it.add_to_end(new_word);
          }
          C_BLOB_IT new_blob_it(new_word->cblob_list());
          new_blob_it.add_to_end(blob_it.extract());
        }
      }
    }
  }
  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
  return new_word != NULL;
}
Esempio n. 24
0
// Places a copy of blobs that are near a word (after applying rotation to the
// blob) in the most appropriate word, unless there is doubt, in which case a
// blob can end up in two words. Source blobs are not touched.
void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
                                        const FCOORD& rotation,
                                        WordGrid* word_grid) {
  WordSearch ws(word_grid);
  BLOBNBOX_IT b_it(diacritic_blobs);
  // Apply rotation to each blob before finding the nearest words. The rotation
  // allows us to only consider above/below placement and not left/right on
  // vertical text, because all text is horizontal here.
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    BLOBNBOX* blobnbox = b_it.data();
    TBOX blob_box = blobnbox->bounding_box();
    blob_box.rotate(rotation);
    ws.StartRectSearch(blob_box);
    // Above/below refer to word position relative to diacritic. Since some
    // scripts eg Kannada/Telugu habitually put diacritics below words, and
    // others eg Thai/Vietnamese/Latin put most diacritics above words, try
    // for both if there isn't much in it.
    WordWithBox* best_above_word = nullptr;
    WordWithBox* best_below_word = nullptr;
    int best_above_distance = 0;
    int best_below_distance = 0;
    for (WordWithBox* word = ws.NextRectSearch(); word != nullptr;
         word = ws.NextRectSearch()) {
      if (word->word()->flag(W_REP_CHAR)) continue;
      TBOX word_box = word->true_bounding_box();
      int x_distance = blob_box.x_gap(word_box);
      int y_distance = blob_box.y_gap(word_box);
      if (x_distance > 0) {
        // Arbitrarily divide x-distance by 2 if there is a major y overlap,
        // and the word is to the left of the diacritic. If the
        // diacritic is a dropped broken character between two words, this will
        // help send all the pieces to a single word, instead of splitting them
        // over the 2 words.
        if (word_box.major_y_overlap(blob_box) &&
            blob_box.left() > word_box.right()) {
          x_distance /= 2;
        }
        y_distance += x_distance;
      }
      if (word_box.y_middle() > blob_box.y_middle() &&
          (best_above_word == nullptr || y_distance < best_above_distance)) {
        best_above_word = word;
        best_above_distance = y_distance;
      }
      if (word_box.y_middle() <= blob_box.y_middle() &&
          (best_below_word == nullptr || y_distance < best_below_distance)) {
        best_below_word = word;
        best_below_distance = y_distance;
      }
    }
    bool above_good =
        best_above_word != nullptr &&
        (best_below_word == nullptr ||
         best_above_distance < best_below_distance + blob_box.height());
    bool below_good =
        best_below_word != nullptr && best_below_word != best_above_word &&
        (best_above_word == nullptr ||
         best_below_distance < best_above_distance + blob_box.height());
    if (below_good) {
      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
      copied_blob->rotate(rotation);
      // Put the blob into the word's reject blobs list.
      C_BLOB_IT blob_it(best_below_word->RejBlobs());
      blob_it.add_to_end(copied_blob);
    }
    if (above_good) {
      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
      copied_blob->rotate(rotation);
      // Put the blob into the word's reject blobs list.
      C_BLOB_IT blob_it(best_above_word->RejBlobs());
      blob_it.add_to_end(copied_blob);
    }
  }
}
Esempio n. 25
0
void collect_ems_for_adaption(WERD_RES *word,
                              CHAR_SAMPLES_LIST *char_clusters,
                              CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                   // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  PBLOB_IT copy_blob_it;
  OUTLINE_IT copy_outline_it;
  inT32 resolution = page_image.get_res ();

  if (tessedit_reject_ems || tessedit_reject_suspect_ems)
    return;                      // Do nothing

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  if (word_adaptable (word, tessedit_em_adaption_mode)
    && word->reject_map.reject_count () == 0
    && (strchr (word->best_choice->string ().string (), 'm') != NULL
    || (tessedit_process_rns
    && strstr (word->best_choice->string ().string (),
  "rn") != NULL))) {
    if (tessedit_process_rns
    && strstr (word->best_choice->string ().string (), "rn") != NULL) {
      copy_outword = *(word->outword);
      copy_blob_it.set_to_list (copy_outword.blob_list ());
      i = 0;
      while (word->best_choice->string ()[i] != '\0') {
        if (word->best_choice->string ()[i] == 'r'
        && word->best_choice->string ()[i + 1] == 'n') {
          copy_outline_it.set_to_list (copy_blob_it.data ()->
            out_list ());
          copy_outline_it.add_list_after (copy_blob_it.
            data_relative (1)->
            out_list ());
          copy_blob_it.forward ();
          delete (copy_blob_it.extract ());
          i++;
        }
        copy_blob_it.forward ();
        i++;
      }
    }
    else
      copy_outword = *(word->outword);

    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
      word->best_choice->string ()[i] != '\0';
    i++, pixrow_it.forward (), blob_it.forward ()) {

      if (word->best_choice->string ()[i] == 'm'
        || (word->best_choice->string ()[i] == 'r'
      && word->best_choice->string ()[i + 1] == 'n')) {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample %c for adaption found in %s, index %d\n",
            word->best_choice->string ()[i],
            word->best_choice->string ().string (), i);
        #endif
        if (tessedit_matrix_match) {
          sample = clip_sample (pixrow_it.data (),
            imlines,
            pix_box,
            copy_outword.flag (W_INVERSE),
            word->best_choice->string ()[i]);

          if (sample == NULL) {  //Clip failed
            #ifndef SECURE_NAMES
            tprintf ("Unable to clip sample from %s, index %d\n",
              word->best_choice->string ().string (), i);
            #endif
            if (word->best_choice->string ()[i] == 'r')
              i++;

            continue;
          }
        }
        else
          sample = new CHAR_SAMPLE (blob_it.data (),
            &word->denorm,
            word->best_choice->string ()[i]);

        cluster_sample(sample, char_clusters, chars_waiting);

        if (word->best_choice->string ()[i] == 'r')
          i++;                   // Skip next character
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
}
Esempio n. 26
0
// Creates and returns a Pix with the same resolution as the original
// in which 1 (black) pixels represent likely non text (photo, line drawing)
// areas of the page, deleting from the blob_block the blobs that were
// determined to be non-text.
// The photo_map is used to bias the decision towards non-text, rather than
// supplying definite decision.
// The blob_block is the usual result of connected component analysis,
// holding the detected blobs.
// The returned Pix should be PixDestroyed after use.
Pix* CCNonTextDetect::ComputeNonTextMask(bool debug, Pix* photo_map,
        TO_BLOCK* blob_block) {
    // Insert the smallest blobs into the grid.
    InsertBlobList(&blob_block->small_blobs);
    InsertBlobList(&blob_block->noise_blobs);
    // Add the medium blobs that don't have a good strokewidth neighbour.
    // Those that do go into good_grid as an antidote to spreading beyond the
    // real reaches of a noise region.
    BlobGrid good_grid(gridsize(), bleft(), tright());
    BLOBNBOX_IT blob_it(&blob_block->blobs);
    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
        BLOBNBOX* blob = blob_it.data();
        double perimeter_area_ratio = blob->cblob()->perimeter() / 4.0;
        perimeter_area_ratio *= perimeter_area_ratio / blob->enclosed_area();
        if (blob->GoodTextBlob() == 0 || perimeter_area_ratio < kMinGoodTextPARatio)
            InsertBBox(true, true, blob);
        else
            good_grid.InsertBBox(true, true, blob);
    }
    noise_density_ = ComputeNoiseDensity(debug, photo_map, &good_grid);
    good_grid.Clear();  // Not needed any more.
    Pix* pix = noise_density_->ThresholdToPix(max_noise_count_);
    if (debug) {
        pixWrite("junknoisemask.png", pix, IFF_PNG);
    }
    ScrollView* win = NULL;
#ifndef GRAPHICS_DISABLED
    if (debug) {
        win = MakeWindow(0, 400, "Photo Mask Blobs");
    }
#endif  // GRAPHICS_DISABLED
    // Large and medium blobs are not text if they overlap with "a lot" of small
    // blobs.
    MarkAndDeleteNonTextBlobs(&blob_block->large_blobs,
                              kMaxLargeOverlapsWithSmall,
                              win, ScrollView::DARK_GREEN, pix);
    MarkAndDeleteNonTextBlobs(&blob_block->blobs, kMaxMediumOverlapsWithSmall,
                              win, ScrollView::WHITE, pix);
    // Clear the grid of small blobs and insert the medium blobs.
    Clear();
    InsertBlobList(&blob_block->blobs);
    MarkAndDeleteNonTextBlobs(&blob_block->large_blobs,
                              kMaxLargeOverlapsWithMedium,
                              win, ScrollView::DARK_GREEN, pix);
    // Clear again before we start deleting the blobs in the grid.
    Clear();
    MarkAndDeleteNonTextBlobs(&blob_block->noise_blobs, -1,
                              win, ScrollView::CORAL, pix);
    MarkAndDeleteNonTextBlobs(&blob_block->small_blobs, -1,
                              win, ScrollView::GOLDENROD, pix);
    MarkAndDeleteNonTextBlobs(&blob_block->blobs, -1,
                              win, ScrollView::WHITE, pix);
    if (debug) {
#ifndef GRAPHICS_DISABLED
        win->Update();
#endif  // GRAPHICS_DISABLED
        pixWrite("junkccphotomask.png", pix, IFF_PNG);
#ifndef GRAPHICS_DISABLED
        delete win->AwaitEvent(SVET_DESTROY);
        delete win;
#endif  // GRAPHICS_DISABLED
    }
    return pix;
}
Esempio n. 27
0
void collect_characters_for_adaption(WERD_RES *word,
                                     CHAR_SAMPLES_LIST *char_clusters,
                                     CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                   // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  inT32 resolution = page_image.get_res ();

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  if ((word_adaptable (word, tessedit_cluster_adaption_mode)
  && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
    if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
      return;                    // Reject map set to acceptable
    /* Collect information about good matches */
    copy_outword = *(word->outword);
    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
      word->best_choice->string ()[i] != '\0';
    i++, pixrow_it.forward (), blob_it.forward ()) {

      if (!(tessedit_mm_use_non_adaption_set
        && STRING (tessedit_non_adaption_set).contains (word->
        best_choice->
        string ()[i]))
      || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample %c for adaption found in %s, index %d\n",
            word->best_choice->string ()[i],
            word->best_choice->string ().string (), i);
        #endif
        sample = clip_sample (pixrow_it.data (),
          imlines,
          pix_box,
          copy_outword.flag (W_INVERSE),
          word->best_choice->string ()[i]);

        if (sample == NULL) {    //Clip failed
          #ifndef SECURE_NAMES
          tprintf ("Unable to clip sample from %s, index %d\n",
            word->best_choice->string ().string (), i);
          #endif
          continue;
        }
        cluster_sample(sample, char_clusters, chars_waiting);
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
  else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
    // Set word to all rejects
    word->reject_map.rej_word_tess_failure ();

}
Esempio n. 28
0
void adapt_to_good_samples(WERD_RES *word,
                           CHAR_SAMPLES_LIST *char_clusters,
                           CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  inT16 i;
  CHAR_SAMPLE *sample;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  float score;
  float best_score;
  char best_char;
  CHAR_SAMPLES *best_cluster;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  TBOX pix_box;                   // box of imlines
  // extent
  WERD copy_outword;             // copy to denorm
  TBOX b_box;
  PBLOB_IT copy_blob_it;
  PIXROW *pixrow = NULL;

  static inT32 word_number = 0;

#ifndef GRAPHICS_DISABLED
  ScrollView* demo_win = NULL;
#endif

  inT32 resolution = page_image.get_res ();

  word_number++;

  if (tessedit_test_cluster_input)
    return;

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (char_clusters->length () == 0) {
    #ifndef SECURE_NAMES
    if (tessedit_cluster_debug)
      tprintf ("No clusters to use for adaption\n");
    #endif
    return;
  }

  if (!cw_it.empty ()) {
    complete_clustering(char_clusters, chars_waiting);
    print_em_stats(char_clusters, chars_waiting);
  }

  if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
  && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
    if (tessedit_cluster_debug) {
      tprintf ("\nChecking: \"%s\"  MAP ",
        word->best_choice->string ().string ());
      word->reject_map.print (debug_fp);
      tprintf ("\n");
    }

    copy_outword = *(word->outword);
    copy_outword.baseline_denormalise (&word->denorm);
    copy_blob_it.set_to_list (copy_outword.blob_list ());
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

                                 // For debugging only
    b_box = copy_outword.bounding_box ();
    pixrow = pixrow_it.data ();

    blob_it.move_to_first ();
    copy_blob_it.move_to_first ();
    for (i = 0;
      word->best_choice->string ()[i] != '\0';
      i++, pixrow_it.forward (), blob_it.forward (),
    copy_blob_it.forward ()) {
      if (word->reject_map[i].recoverable ()
      || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
        TBOX copy_box = copy_blob_it.data ()->bounding_box ();

        if (tessedit_cluster_debug)
          tprintf ("Sample %c to check found in %s, index %d\n",
            word->best_choice->string ()[i],
            word->best_choice->string ().string (), i);

        if (tessedit_demo_adaption)
          tprintf ("Sample %c to check found in %s (%d), index %d\n",
            word->best_choice->string ()[i],
            word->best_choice->string ().string (),
            word_number, i);

        sample = clip_sample (pixrow_it.data (),
          imlines,
          pix_box,
          copy_outword.flag (W_INVERSE),
          word->best_choice->string ()[i]);

        if (sample == NULL) {    //Clip failed
          tprintf ("Unable to clip sample from %s, index %d\n",
            word->best_choice->string ().string (), i);
          #ifndef SECURE_NAMES
          if (tessedit_cluster_debug)
            tprintf ("Sample rejected (no sample)\n");
          #endif
          word->reject_map[i].setrej_mm_reject ();

          continue;
        }

        best_score = MAX_INT32;
        best_char = '\0';
        best_cluster = NULL;

        for (c_it.mark_cycle_pt ();
        !c_it.cycled_list (); c_it.forward ()) {
          if (c_it.data ()->character () != '\0') {
            score = c_it.data ()->match_score (sample);
            if (score < best_score) {
              best_cluster = c_it.data ();
              best_score = score;
              best_char = c_it.data ()->character ();
            }
          }
        }

        if (best_score > tessedit_cluster_t1) {
          #ifndef SECURE_NAMES
          if (tessedit_cluster_debug)
            tprintf ("Sample rejected (score %f)\n", best_score);
          if (tessedit_demo_adaption)
            tprintf ("Sample rejected (score %f)\n", best_score);
          #endif
          word->reject_map[i].setrej_mm_reject ();
        }
        else {
          if (word->best_choice->string ()[i] == best_char) {
            #ifndef SECURE_NAMES
            if (tessedit_cluster_debug)
              tprintf ("Sample accepted (score %f)\n", best_score);
            if (tessedit_demo_adaption)
              tprintf ("Sample accepted (score %f)\n", best_score);
            #endif
            if (tessedit_test_adaption)
              word->reject_map[i].setrej_minimal_rej_accept ();
            else
              word->reject_map[i].setrej_mm_accept ();
          }
          else {
            #ifndef SECURE_NAMES
            if (tessedit_cluster_debug)
              tprintf ("Sample rejected (char %c, score %f)\n",
                best_char, best_score);
            if (tessedit_demo_adaption)
              tprintf ("Sample rejected (char %c, score %f)\n",
                best_char, best_score);
            #endif
            word->reject_map[i].setrej_mm_reject ();
          }
        }

        if (tessedit_demo_adaption) {
          if (strcmp (imagebasename.string (),
            tessedit_demo_file.string ()) != 0
            || word_number == tessedit_demo_word1
          || word_number == tessedit_demo_word2) {
#ifndef GRAPHICS_DISABLED
            demo_win =
              display_clip_image(&copy_outword,
                                 page_image,
                                 pixrow_list,
                                 pix_box);
#endif
            demo_word = word_number;
            best_cluster->match_score (sample);
            demo_word = 0;
          }
        }
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;

    if (tessedit_cluster_debug) {
      tprintf ("\nFinal: \"%s\"  MAP ",
        word->best_choice->string ().string ());
      word->reject_map.print (debug_fp);
      tprintf ("\n");
    }
  }
}