Esempi in C++ (Cpp) per TBLOB, esempi in C++ (Cpp) per TBLOB

Esempio n. 1

0

Mostra file

File: plotseg.cpp Progetto: slohman/October2012Workspace

/**********************************************************************
 * render_segmentation
 *
 * Create a list of line segments that represent the list of chunks
 * using the correct segmentation that was supplied as input.
 **********************************************************************/
void render_segmentation(ScrollView *window,
                         TBLOB *chunks,
                         SEARCH_STATE segmentation) {
  TBLOB *blob;
  C_COL color = Black;
  int char_num = -1;
  int chunks_left = 0;

  TBOX bbox;
  if (chunks) bbox = chunks->bounding_box();

  for (blob = chunks; blob != NULL; blob = blob->next) {
    bbox += blob->bounding_box();
    if (chunks_left-- == 0) {
      color = color_list[++char_num % NUM_COLORS];

      if (char_num < segmentation[0])
        chunks_left = segmentation[char_num + 1];
      else
        chunks_left = MAX_INT32;
    }
    render_outline(window, blob->outlines, color);
  }
  window->ZoomToRectangle(bbox.left(), bbox.top(),
                          bbox.right(), bbox.bottom());
}

Esempio n. 2

0

Mostra file

File: blobs.cpp Progetto: ErwcPKerr/node-dv

/**********************************************************************
 * blobs_widths
 *
 * Compute the widths of a list of blobs. Return an array of the widths
 * and gaps.
 **********************************************************************/
WIDTH_RECORD *blobs_widths(TBLOB *blobs) {  /*blob to compute on */
  WIDTH_RECORD *width_record;
  TPOINT topleft;                /*bounding box */
  TPOINT botright;
  int i = 0;
  int blob_end;
  int num_blobs = count_blobs (blobs);

  /* Get memory */
  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

  TBOX bbox = blobs->bounding_box();
  width_record->widths[i++] = bbox.width();
  /* First width */
  blob_end = bbox.right();

  for (TBLOB* blob = blobs->next; blob != NULL; blob = blob->next) {
    TBOX curbox = blob->bounding_box();
    width_record->widths[i++] = curbox.left() - blob_end;
    width_record->widths[i++] = curbox.width();
    blob_end = curbox.right();
  }
  return width_record;
}

Esempio n. 3

0

Mostra file

File: blobs.cpp Progetto: coffeesam/tesseract-ocr

void TWERD::plot(ScrollView* window) {
  ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    blob->plot(window, color, ScrollView::BROWN);
    color = WERD::NextColor(color);
  }
}

Esempio n. 4

0

Mostra file

File: fixxht.cpp Progetto: ErwcPKerr/node-dv

// Returns the number of misfit blob tops in this word.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
  int bad_blobs = 0;
  TBLOB* blob = word_res->rebuild_word->blobs;
  int blob_id = 0;
  for (; blob != NULL; blob = blob->next, ++blob_id) {
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
      int top = blob->bounding_box().top();
      if (top >= INT_FEAT_RANGE)
        top = INT_FEAT_RANGE - 1;
      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                &min_top, &max_top);
      if (max_top - min_top > kMaxCharTopRange)
        continue;
      bool bad =  top < min_top - x_ht_acceptance_tolerance ||
                  top > max_top + x_ht_acceptance_tolerance;
      if (bad)
        ++bad_blobs;
      if (debug_x_ht_level >= 1) {
        tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
                unicharset.id_to_unichar(class_id),
                bad ? "Misfit" : "OK", top, min_top, max_top,
                static_cast<int>(x_ht_acceptance_tolerance));
      }
    }
  }
  return bad_blobs;
}

Esempio n. 5

0

Mostra file

File: chopper.cpp Progetto: 0359xiaodong/tess-two

/**
 * @name chop_one_blob
 *
 * Start with the current one-blob word and its classification.  Find
 * the worst blobs and try to divide it up to improve the ratings.
 * Used for testing chopper.
 */
bool Wordrec::chop_one_blob(TWERD *word,
                               BLOB_CHOICE_LIST_VECTOR *char_choices,
                               inT32 *blob_number,
                               SEAMS *seam_list,
                               int *right_chop_index) {
  TBLOB *blob;
  inT16 x = 0;
  float rating_ceiling = MAX_FLOAT32;
  BLOB_CHOICE_LIST *answer;
  BLOB_CHOICE_IT answer_it;
  SEAM *seam;
  UNICHAR_ID unichar_id = 0;
  int left_chop_index = 0;

  do {
    *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false);
    if (chop_debug)
      cprintf("blob_number = %d\n", *blob_number);
    if (*blob_number == -1)
      return false;
    seam = attempt_blob_chop(word, *blob_number, true, *seam_list);
    if (seam != NULL)
      break;
    /* Must split null blobs */
    answer = char_choices->get(*blob_number);
    if (answer == NULL)
      return false;
    answer_it.set_to_list(answer);
    rating_ceiling = answer_it.data()->rating();  // try a different blob
  } while (true);
  /* Split OK */
  for (blob = word->blobs; x < *blob_number; x++) {
    blob = blob->next;
  }
  if (chop_debug) {
    tprintf("Chop made blob1:");
    blob->bounding_box().print();
    tprintf("and blob2:");
    blob->next->bounding_box().print();
  }
  *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);

  answer = char_choices->get(*blob_number);
  answer_it.set_to_list(answer);
  unichar_id = answer_it.data()->unichar_id();
  float rating = answer_it.data()->rating() / exp(1.0);
  left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));

  delete char_choices->get(*blob_number);
  // combine confidence w/ serial #
  answer = fake_classify_blob(0, rating, -rating);
  modify_blob_choice(answer, left_chop_index);
  char_choices->insert(answer, *blob_number);

  answer = fake_classify_blob(0, rating - 0.125f, -rating);
  modify_blob_choice(answer, ++*right_chop_index);
  char_choices->set(answer, *blob_number + 1);
  return true;
}

Esempio n. 6

0

Mostra file

File: blobs.cpp Progetto: coffeesam/tesseract-ocr

TBOX TWERD::bounding_box() const {
  TBOX result;
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    TBOX box = blob->bounding_box();
    result += box;
  }
  return result;
}

Esempio n. 7

0

Mostra file

File: pieces.cpp Progetto: ErfanHasmin/scope-ocr

/**********************************************************************
 * record_blob_bounds
 *
 * Set up and initialize an array that holds the bounds of a set of
 * blobs.  Caller should delete[] the array.
 **********************************************************************/
TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) {
  int nblobs = count_blobs(blobs);
  TBOX *bboxes = new TBOX[nblobs];

  inT16 x = 0;
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    bboxes[x] = blob->bounding_box();
    x++;
  }
  return bboxes;
}

Esempio n. 8

0

Mostra file

File: reject.cpp Progetto: xmarston/BillRecognizer

// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  int prev_right = -9999;
  int next_left;
  TBOX out_box;
  float aspect_ratio;

  if (tessedit_lower_flip_hyphen <= 1)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    out_box = blob->bounding_box();
    if (i + 1 == num_blobs)
      next_left = 9999;
    else
      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
    // Dont touch small or touching blobs - it is too dangerous.
    if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
        (out_box.left() > prev_right) && (out_box.right() < next_left)) {
      aspect_ratio = out_box.width() / (float) out_box.height();
      if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
            word_res->uch_set->contains_unichar_id(unichar_dash) &&
            word_res->uch_set->get_enabled(unichar_dash)) {
          /* Certain HYPHEN */
          best_choice->set_unichar_id(unichar_dash, i);
          if (word_res->reject_map[i].rejected())
            word_res->reject_map[i].setrej_hyphen_accept();
        }
        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
          word_res->reject_map[i].accepted())
                                 //Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen ();
      }
      else if (best_choice->unichar_id(i) == unichar_dash) {
        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
          (word_res->reject_map[i].rejected()))
          word_res->reject_map[i].setrej_hyphen_accept();
        //Certain HYPHEN

        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
          (word_res->reject_map[i].accepted()))
                                 //Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
      }
    }
    prev_right = out_box.right();
  }
}

Esempio n. 9

0

Mostra file

File: blobs.cpp Progetto: 11110101/tess-two

// Baseline normalizes the blobs in-place, recording the normalization in the
// DENORMs in the blobs.
void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
                        bool inverse, float x_height, bool numeric_mode,
                        tesseract::OcrEngineMode hint,
                        const TBOX* norm_box,
                        DENORM* word_denorm) {
  TBOX word_box = bounding_box();
  if (norm_box != NULL) word_box = *norm_box;
  float word_middle = (word_box.left() + word_box.right()) / 2.0f;
  float input_y_offset = 0.0f;
  float final_y_offset = static_cast<float>(kBlnBaselineOffset);
  float scale = kBlnXHeight / x_height;
  if (hint == tesseract::OEM_CUBE_ONLY || row == NULL) {
    word_middle = word_box.left();
    input_y_offset = word_box.bottom();
    final_y_offset = 0.0f;
    if (hint == tesseract::OEM_CUBE_ONLY)
      scale = 1.0f;
  } else {
    input_y_offset = row->base_line(word_middle);
  }
  for (int b = 0; b < blobs.size(); ++b) {
    TBLOB* blob = blobs[b];
    TBOX blob_box = blob->bounding_box();
    float mid_x = (blob_box.left() + blob_box.right()) / 2.0f;
    float baseline = input_y_offset;
    float blob_scale = scale;
    if (numeric_mode) {
      baseline = blob_box.bottom();
      blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
                               scale, scale * 1.5f);
    } else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) {
      baseline = row->base_line(mid_x);
    }
    // The image will be 8-bit grey if the input was grey or color. Note that in
    // a grey image 0 is black and 255 is white. If the input was binary, then
    // the pix will be binary and 0 is white, with 1 being black.
    // To tell the difference pixGetDepth() will return 8 or 1.
    // The inverse flag will be true iff the word has been determined to be
    // white on black, and is independent of whether the pix is 8 bit or 1 bit.
    blob->Normalize(block, NULL, NULL, word_middle, baseline, blob_scale,
                    blob_scale, 0.0f, final_y_offset, inverse, pix);
  }
  if (word_denorm != NULL) {
    word_denorm->SetupNormalization(block, NULL, NULL, word_middle,
                                    input_y_offset, scale, scale,
                                    0.0f, final_y_offset);
    word_denorm->set_inverse(inverse);
    word_denorm->set_pix(pix);
  }
}

Esempio n. 10

0

Mostra file

File: intfx.cpp Progetto: dqsoft/tesseract

// Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
// (x,y) position and angle as measured counterclockwise from the vector
// <-1, 0>, from blob using two normalizations defined by bl_denorm and
// cn_denorm. See SetpuBLCNDenorms for definitions.
// If outline_cn_counts is not nullptr, on return it contains the cumulative
// number of cn features generated for each outline in the blob (in order).
// Thus after the first outline, there were (*outline_cn_counts)[0] features,
// after the second outline, there were (*outline_cn_counts)[1] features etc.
void Classify::ExtractFeatures(const TBLOB& blob,
                               bool nonlinear_norm,
                               GenericVector<INT_FEATURE_STRUCT>* bl_features,
                               GenericVector<INT_FEATURE_STRUCT>* cn_features,
                               INT_FX_RESULT_STRUCT* results,
                               GenericVector<int>* outline_cn_counts) {
  DENORM bl_denorm, cn_denorm;
  tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm,
                                        &bl_denorm, &cn_denorm, results);
  if (outline_cn_counts != nullptr)
    outline_cn_counts->truncate(0);
  // Iterate the outlines.
  for (TESSLINE* ol = blob.outlines; ol != nullptr; ol = ol->next) {
    // Iterate the polygon.
    EDGEPT* loop_pt = ol->FindBestStartPt();
    EDGEPT* pt = loop_pt;
    if (pt == nullptr) continue;
    do {
      if (pt->IsHidden()) continue;
      // Find a run of equal src_outline.
      EDGEPT* last_pt = pt;
      do {
        last_pt = last_pt->next;
      } while (last_pt != loop_pt && !last_pt->IsHidden() &&
               last_pt->src_outline == pt->src_outline);
      last_pt = last_pt->prev;
      // Until the adaptive classifier can be weaned off polygon segments,
      // we have to force extraction from the polygon for the bl_features.
      ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength,
                             true, bl_features);
      ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength,
                             false, cn_features);
      pt = last_pt;
    } while ((pt = pt->next) != loop_pt);
    if (outline_cn_counts != nullptr)
      outline_cn_counts->push_back(cn_features->size());
  }
  results->NumBL = bl_features->size();
  results->NumCN = cn_features->size();
  results->YBottom = blob.bounding_box().bottom();
  results->YTop = blob.bounding_box().top();
  results->Width = blob.bounding_box().width();
}

Esempio n. 11

0

Mostra file

File: ratngs.cpp Progetto: 11110101/tess-two

// Displays the segmentation state of *this (if not the same as the last
// one displayed) and waits for a click in the window.
void WERD_CHOICE::DisplaySegmentation(TWERD* word) {
#ifndef GRAPHICS_DISABLED
  // Number of different colors to draw with.
  const int kNumColors = 6;
  static ScrollView *segm_window = NULL;
  // Check the state against the static prev_drawn_state.
  static GenericVector<int> prev_drawn_state;
  bool already_done = prev_drawn_state.size() == length_;
  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
  for (int i = 0; i < length_; ++i) {
    if (prev_drawn_state[i] != state_[i]) {
      already_done = false;
    }
    prev_drawn_state[i] = state_[i];
  }
  if (already_done || word->blobs.empty()) return;

  // Create the window if needed.
  if (segm_window == NULL) {
    segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
                                 2000.0, 256.0, true);
  } else {
    segm_window->Clear();
  }

  TBOX bbox;
  int blob_index = 0;
  for (int c = 0; c < length_; ++c) {
    ScrollView::Color color =
        static_cast<ScrollView::Color>(c % kNumColors + 3);
    for (int i = 0; i < state_[c]; ++i, ++blob_index) {
      TBLOB* blob = word->blobs[blob_index];
      bbox += blob->bounding_box();
      blob->plot(segm_window, color, color);
    }
  }
  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
                               bbox.right(), bbox.bottom());
  segm_window->Update();
  window_wait(segm_window);
#endif
}

Esempio n. 12

0

Mostra file

File: chopper.cpp Progetto: ManishKSharma/tess-two

SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
                                     bool italic_blob, WERD_RES *word_res,
                                     int *blob_number) {
  TWERD *word = word_res->chopped_word;
  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
    TBLOB *blob = word->blobs[*blob_number];
    TPOINT topleft, botright;
    topleft.x = blob->bounding_box().left();
    topleft.y = blob->bounding_box().top();
    botright.x = blob->bounding_box().right();
    botright.y = blob->bounding_box().bottom();

    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
    word_res->denorm.DenormTransform(NULL, botright, &original_botright);

    TBOX original_box = TBOX(original_topleft.x, original_botright.y,
                             original_botright.x, original_topleft.y);

    bool almost_equal_box = false;
    int num_overlap = 0;
    for (int i = 0; i < boxes.size(); i++) {
      if (original_box.overlap_fraction(boxes[i]) > 0.125)
        num_overlap++;
      if (original_box.almost_equal(boxes[i], 3))
        almost_equal_box = true;
    }

    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location) ||
        (!almost_equal_box && num_overlap > 1)) {
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                     italic_blob, word_res->seam_array);
      if (seam != NULL)
        return seam;
    }
  }

  *blob_number = -1;
  return NULL;
}

Esempio n. 13

0

Mostra file

File: blobs.cpp Progetto: ErwcPKerr/node-dv

// Normalize in-place and record the normalization in the DENORM.
void TWERD::SetupBLNormalize(const BLOCK* block, const ROW* row,
                             float x_height, bool numeric_mode,
                             DENORM* denorm) const {
  int num_segments = 0;
  DENORM_SEG* segs = NULL;
  if (numeric_mode) {
    segs = new DENORM_SEG[NumBlobs()];
    for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
      TBOX blob_box = blob->bounding_box();
      float factor = kBlnXHeight / x_height;
      factor = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
                           factor, factor * 1.5f);
      segs[num_segments].xstart = blob_box.left();
      segs[num_segments].ycoord = blob_box.bottom();
      segs[num_segments++].scale_factor = factor;
    }
  }
  denorm->SetupBLNormalize(block, row, x_height, bounding_box(),
                           num_segments, segs);
  delete [] segs;
}

Esempio n. 14

0

Mostra file

File: intfx.cpp Progetto: dqsoft/tesseract

// Computes the DENORMS for bl(baseline) and cn(character) normalization
// during feature extraction. The input denorm describes the current state
// of the blob, which is usually a baseline-normalized word.
// The Transforms setup are as follows:
// Baseline Normalized (bl) Output:
//   We center the grapheme by aligning the x-coordinate of its centroid with
//   x=128 and leaving the already-baseline-normalized y as-is.
//
// Character Normalized (cn) Output:
//   We align the grapheme's centroid at the origin and scale it
//   asymmetrically in x and y so that the 2nd moments are a standard value
//   (51.2) ie the result is vaguely square.
// If classify_nonlinear_norm is true:
//   A non-linear normalization is setup that attempts to evenly distribute
//   edges across x and y.
//
// Some of the fields of fx_info are also setup:
// Length: Total length of outline.
// Rx:     Rounded y second moment. (Reversed by convention.)
// Ry:     rounded x second moment.
// Xmean:  Rounded x center of mass of the blob.
// Ymean:  Rounded y center of mass of the blob.
void Classify::SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
                                DENORM* bl_denorm, DENORM* cn_denorm,
                                INT_FX_RESULT_STRUCT* fx_info) {
  // Compute 1st and 2nd moments of the original outline.
  FCOORD center, second_moments;
  int length = blob.ComputeMoments(&center, &second_moments);
  if (fx_info != nullptr) {
    fx_info->Length = length;
    fx_info->Rx = IntCastRounded(second_moments.y());
    fx_info->Ry = IntCastRounded(second_moments.x());

    fx_info->Xmean = IntCastRounded(center.x());
    fx_info->Ymean = IntCastRounded(center.y());
  }
  // Setup the denorm for Baseline normalization.
  bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f,
                                1.0f, 1.0f, 128.0f, 128.0f);
  // Setup the denorm for character normalization.
  if (nonlinear_norm) {
    GenericVector<GenericVector<int> > x_coords;
    GenericVector<GenericVector<int> > y_coords;
    TBOX box;
    blob.GetPreciseBoundingBox(&box);
    box.pad(1, 1);
    blob.GetEdgeCoords(box, &x_coords, &y_coords);
    cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX,
                              0.0f, 0.0f, x_coords, y_coords);
  } else {
    cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(),
                                  center.x(), center.y(),
                                  51.2f / second_moments.x(),
                                  51.2f / second_moments.y(),
                                  128.0f, 128.0f);
  }
}

Esempio n. 15

0

Mostra file

File: M_Utils.cpp Progetto: Andy0898/Project_Isagoge

BLOB_CHOICE* M_Utils::runBlobOCR(BLOBNBOX* blob, Tesseract* ocrengine) {
  // * Normalize blob height to x-height (current OSD):
  // SetupNormalization(NULL, NULL, &rotation, NULL, NULL, 0,
  //                    box.rotational_x_middle(rotation),
  //                    box.rotational_y_middle(rotation),
  //                    kBlnXHeight / box.rotational_height(rotation),
  //                    kBlnXHeight / box.rotational_height(rotation),
  //                    0, kBlnBaselineOffset);
  BLOB_CHOICE_LIST ratings_lang;
  C_BLOB* cblob = blob->cblob();
  TBLOB* tblob = TBLOB::PolygonalCopy(cblob);
  const TBOX& box = tblob->bounding_box();

  // Normalize the blob. Set the origin to the place we want to be the
  // bottom-middle, and scaling is to mpx, box_, NULL);
  float scaling = static_cast<float>(kBlnXHeight) / box.height();
  DENORM denorm;
  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
  denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0,
                            x_orig, y_orig, scaling, scaling,
                            0.0f, static_cast<float>(kBlnBaselineOffset));
  TBLOB* normed_blob = new TBLOB(*tblob);
  normed_blob->Normalize(denorm);
  ocrengine->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL);
  delete normed_blob;
  delete tblob;

  // Get the best choice from ratings_lang and rating_equ. As the choice in the
  // list has already been sorted by the certainty, we simply use the first
  // choice.
  BLOB_CHOICE *lang_choice = NULL;
  if (ratings_lang.length() > 0) {
    BLOB_CHOICE_IT choice_it(&ratings_lang);
    lang_choice = choice_it.data();
  }
  return lang_choice;
}

Esempio n. 16

0

Mostra file

File: blobs.cpp Progetto: coffeesam/tesseract-ocr

// Normalize in-place and record the normalization in the DENORM.
void TWERD::Normalize(ROW* row, float x_height, bool numeric_mode,
                      DENORM* denorm) {
  TBOX word_box = bounding_box();
  DENORM antidote((word_box.left() + word_box.right()) / 2.0,
                  kBlnXHeight / x_height, row);
  if (row == NULL) {
    antidote = DENORM(antidote.origin(), antidote.scale(), 0.0,
                      word_box.bottom(), 0, NULL, false, NULL);
  }
  int num_segments = 0;
  DENORM_SEG *segs = new DENORM_SEG[NumBlobs()];
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    TBOX blob_box = blob->bounding_box();
    ICOORD translation(-static_cast<int>(floor(antidote.origin() + 0.5)),
                       -blob_box.bottom());
    float factor = antidote.scale();
    if (numeric_mode) {
      factor = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
                           factor, factor * 1.5f);
      segs[num_segments].xstart = blob->bounding_box().left();
      segs[num_segments].ycoord = blob_box.bottom();
      segs[num_segments++].scale_factor = factor;
    } else {
      float blob_x_center = (blob_box.left() + blob_box.right()) / 2.0;
      float y_shift = antidote.yshift_at_orig_x(blob_x_center);
      translation.set_y(-static_cast<int>(floor(y_shift + 0.5)));
    }
    blob->Move(translation);
    blob->Scale(factor);
    blob->Move(ICOORD(0, kBlnBaselineOffset));
  }
  if (num_segments > 0) {
    antidote.set_segments(segs, num_segments);
  }
  delete [] segs;
  if (denorm != NULL)
    *denorm = antidote;
}

Esempio n. 17

0

Mostra file

File: fixxht.cpp Progetto: ErwcPKerr/node-dv

// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
  STATS top_stats(0, MAX_UINT8);
  TBLOB* blob = word_res->rebuild_word->blobs;
  int blob_id = 0;
  for (; blob != NULL; blob = blob->next, ++blob_id) {
    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
      int top = blob->bounding_box().top();
      // Clip the top to the limit of normalized feature space.
      if (top >= INT_FEAT_RANGE)
        top = INT_FEAT_RANGE - 1;
      int bottom = blob->bounding_box().bottom();
      int min_bottom, max_bottom, min_top, max_top;
      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                &min_top, &max_top);
      // Chars with a wild top range would mess up the result so ignore them.
      if (max_top - min_top > kMaxCharTopRange)
        continue;
      int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                          top - (max_top + x_ht_acceptance_tolerance));
      int height = top - kBlnBaselineOffset;
      if (debug_x_ht_level >= 20) {
        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
                unicharset.id_to_unichar(class_id),
                height, min_bottom, max_bottom, min_top, max_top,
                bottom, top);
      }
      // Use only chars that fit in the expected bottom range, and where
      // the range of tops is sensibly near the xheight.
      if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
          bottom - x_ht_acceptance_tolerance <= max_bottom &&
          min_top > kBlnBaselineOffset &&
          max_top - kBlnBaselineOffset >= kBlnXHeight &&
          misfit_dist > 0) {
        // Compute the x-height position using proportionality between the
        // actual height and expected height.
        int min_xht = DivRounded(height * kBlnXHeight,
                                 max_top - kBlnBaselineOffset);
        int max_xht = DivRounded(height * kBlnXHeight,
                                 min_top - kBlnBaselineOffset);
        if (debug_x_ht_level >= 20) {
          tprintf(" xht range min=%d, max=%d\n",
                  min_xht, max_xht);
        }
        // The range of expected heights gets a vote equal to the distance
        // of the actual top from the expected top.
        for (int y = min_xht; y <= max_xht; ++y)
          top_stats.add(y, misfit_dist);
      } else if (debug_x_ht_level >= 20) {
        tprintf(" already OK\n");
      }
    }
  }
  if (top_stats.get_total() == 0)
    return 0.0f;
  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 20) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
            new_xht, new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
    return new_xht / word_res->denorm.y_scale();
  else
    return 0.0f;
}

Esempio n. 18

0

Mostra file

File: fixxht.cpp Progetto: 0ximDigital/appsScanner

// Returns a new x-height maximally compatible with the result in word_res.
// See comment above for overall algorithm.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
                                          float* baseline_shift) {
  STATS top_stats(0, MAX_UINT8);
  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
  int bottom_shift = 0;
  int num_blobs = word_res->rebuild_word->NumBlobs();
  do {
    top_stats.clear();
    shift_stats.clear();
    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
      TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
      if (unicharset.get_isalpha(class_id) ||
          unicharset.get_isdigit(class_id)) {
        int top = blob->bounding_box().top() + bottom_shift;
        // Clip the top to the limit of normalized feature space.
        if (top >= INT_FEAT_RANGE)
          top = INT_FEAT_RANGE - 1;
        int bottom = blob->bounding_box().bottom() + bottom_shift;
        int min_bottom, max_bottom, min_top, max_top;
        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                  &min_top, &max_top);
        // Chars with a wild top range would mess up the result so ignore them.
        if (max_top - min_top > kMaxCharTopRange)
          continue;
        int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                            top - (max_top + x_ht_acceptance_tolerance));
        int height = top - kBlnBaselineOffset;
        if (debug_x_ht_level >= 2) {
          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
                  unicharset.id_to_unichar(class_id),
                  height, min_bottom, max_bottom, min_top, max_top,
                  bottom, top);
        }
        // Use only chars that fit in the expected bottom range, and where
        // the range of tops is sensibly near the xheight.
        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
            bottom - x_ht_acceptance_tolerance <= max_bottom &&
            min_top > kBlnBaselineOffset &&
            max_top - kBlnBaselineOffset >= kBlnXHeight &&
            misfit_dist > 0) {
          // Compute the x-height position using proportionality between the
          // actual height and expected height.
          int min_xht = DivRounded(height * kBlnXHeight,
                                   max_top - kBlnBaselineOffset);
          int max_xht = DivRounded(height * kBlnXHeight,
                                   min_top - kBlnBaselineOffset);
          if (debug_x_ht_level >= 2) {
            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
          }
          // The range of expected heights gets a vote equal to the distance
          // of the actual top from the expected top.
          for (int y = min_xht; y <= max_xht; ++y)
            top_stats.add(y, misfit_dist);
        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
                    bottom - x_ht_acceptance_tolerance > max_bottom) &&
                   bottom_shift == 0) {
          // Get the range of required bottom shift.
          int min_shift = min_bottom - bottom;
          int max_shift = max_bottom - bottom;
          if (debug_x_ht_level >= 2) {
            tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
          }
          // The range of expected shifts gets a vote equal to the min distance
          // of the actual bottom from the expected bottom, spread over the
          // range of its acceptance.
          int misfit_weight = abs(min_shift);
          if (max_shift > min_shift)
            misfit_weight /= max_shift - min_shift;
          for (int y = min_shift; y <= max_shift; ++y)
            shift_stats.add(y, misfit_weight);
        } else {
          if (bottom_shift == 0) {
            // Things with bottoms that are already ok need to say so, on the
            // 1st iteration only.
            shift_stats.add(0, kBlnBaselineOffset);
          }
          if (debug_x_ht_level >= 2) {
            tprintf(" already OK\n");
          }
        }
      }
    }
    if (shift_stats.get_total() > top_stats.get_total()) {
      bottom_shift = IntCastRounded(shift_stats.median());
      if (debug_x_ht_level >= 2) {
        tprintf("Applying bottom shift=%d\n", bottom_shift);
      }
    }
  } while (bottom_shift != 0 &&
           top_stats.get_total() < shift_stats.get_total());
  // Baseline shift is opposite sign to the bottom shift.
  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
  if (debug_x_ht_level >= 2) {
    tprintf("baseline shift=%g\n", *baseline_shift);
  }
  if (top_stats.get_total() == 0)
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
  // The new xheight is just the median vote, which is then scaled out
  // of BLN space back to pixel space to get the x-height in pixel space.
  float new_xht = top_stats.median();
  if (debug_x_ht_level >= 2) {
    tprintf("Median xht=%f\n", new_xht);
    tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
            new_xht, new_xht / word_res->denorm.y_scale());
  }
  // The xheight must change by at least x_ht_min_change to be used.
  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
    return new_xht / word_res->denorm.y_scale();
  else
    return bottom_shift != 0 ? word_res->x_height : 0.0f;
}

Esempio n. 19

0

Mostra file

File: fixspace.cpp Progetto: ErfanHasmin/scope-ocr

/**
 * @name uniformly_spaced()
 * Return true if one of the following are true:
 * - All inter-char gaps are the same width
 * - The largest gap is no larger than twice the mean/median of the others
 * - The largest gap is < normalised_max_nonspace
 * **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
 */
BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
  TBOX box;
  inT16 prev_right = -MAX_INT16;
  inT16 gap;
  inT16 max_gap = -MAX_INT16;
  inT16 max_gap_count = 0;
  STATS gap_stats(0, MAXSPACING);
  BOOL8 result;
  const ROW *row = word->denorm.row();
  float max_non_space;
  float normalised_max_nonspace;
  inT16 i = 0;
  inT16 offset = 0;
  STRING punct_chars = "\"`',.:;";

  for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
       blob = blob->next) {
    box = blob->bounding_box();
    if ((prev_right > -MAX_INT16) &&
        (!punct_chars.contains(
             word->best_choice->unichar_string()
                 [offset - word->best_choice->unichar_lengths()[i - 1]]) &&
         !punct_chars.contains(
             word->best_choice->unichar_string()[offset]))) {
      gap = box.left() - prev_right;
      if (gap < max_gap) {
        gap_stats.add(gap, 1);
      } else if (gap == max_gap) {
        max_gap_count++;
      } else {
        if (max_gap_count > 0)
          gap_stats.add(max_gap, max_gap_count);
        max_gap = gap;
        max_gap_count = 1;
      }
    }
    prev_right = box.right();
    offset += word->best_choice->unichar_lengths()[i++];
  }

  max_non_space = (row->space() + 3 * row->kern()) / 4;
  normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();

  result = (
      gap_stats.get_total() == 0 ||
      max_gap <= normalised_max_nonspace ||
      (gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
      (gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
  #ifndef SECURE_NAMES
  if ((debug_fix_space_level > 1)) {
    if (result) {
      tprintf(
          "ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
          "total=%d mean=%f median=%f\n",
          word->best_choice->unichar_string().string(), normalised_max_nonspace,
          max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
          gap_stats.median());
    } else {
      tprintf(
          "REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
          "total=%d mean=%f median=%f\n",
          word->best_choice->unichar_string().string(), normalised_max_nonspace,
          max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
          gap_stats.median());
    }
  }
  #endif

  return result;
}

Esempio n. 20

0

Mostra file

File: classify.cpp Progetto: vkbrad/AndroidOCR

// Returns true if the blob is small enough to be a large speckle.
bool Classify::LargeSpeckle(const TBLOB &blob) {
  double speckle_size = kBlnXHeight * speckle_large_max_size;
  TBOX bbox = blob.bounding_box();
  return bbox.width() < speckle_size && bbox.height() < speckle_size;
}

Esempio n. 21

0

Mostra file

File: split.cpp Progetto: selgod/ExpenseReportManager

// Returns true if *this SPLIT appears OK in the sense that it does not cross
// any outlines and does not chop off any ridiculously small pieces.
bool SPLIT::IsHealthy(const TBLOB& blob, int min_points, int min_area) const {
  return !IsLittleChunk(min_points, min_area) &&
         !blob.SegmentCrossesOutline(point1->pos, point2->pos);
}

Esempio n. 22

0

Mostra file

File: blobs.cpp Progetto: coffeesam/tesseract-ocr

// Recomputes the bounding boxes of the blobs.
void TWERD::ComputeBoundingBoxes() {
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    blob->ComputeBoundingBoxes();
  }
}

Esempio n. 23

0

Mostra file

File: ratngs.cpp Progetto: 11110101/tess-two

// Sets up the script_pos_ member using the blobs_list to get the bln
// bounding boxes, *this to get the unichars, and this->unicharset
// to get the target positions. If small_caps is true, sub/super are not
// considered, but dropcaps are.
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
  // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
  // we don't have easy access to the flags Tesseract stores.  Therefore, debug
  // for this module is hard compiled in.
  int debug = 0;

  // Initialize to normal.
  for (int i = 0; i < length_; ++i)
    script_pos_[i] = tesseract::SP_NORMAL;
  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
    return;
  }

  int position_counts[4];
  for (int i = 0; i < 4; i++) {
    position_counts[i] = 0;
  }

  int chunk_index = 0;
  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
    TBLOB* tblob = word->blobs[chunk_index];
    int uni_id = unichar_id(blob_index);
    TBOX blob_box = tblob->bounding_box();
    if (state_ != NULL) {
      for (int i = 1; i <  state_[blob_index]; ++i) {
        ++chunk_index;
        tblob = word->blobs[chunk_index];
        blob_box += tblob->bounding_box();
      }
    }
    script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
                                               uni_id);
    if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
      script_pos_[blob_index] = tesseract::SP_NORMAL;
    }
    position_counts[script_pos_[blob_index]]++;
  }
  // If almost everything looks like a superscript or subscript,
  // we most likely just got the baseline wrong.
  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
      position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
    if (debug >= 2) {
      tprintf("Most characters of %s are subscript or superscript.\n"
              "That seems wrong, so I'll assume we got the baseline wrong\n",
              unichar_string().string());
    }
    for (int i = 0; i < length_; i++) {
      ScriptPos sp = script_pos_[i];
      if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
        position_counts[sp]--;
        position_counts[tesseract::SP_NORMAL]++;
        script_pos_[i] = tesseract::SP_NORMAL;
      }
    }
  }

  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
      debug >= 2) {
    tprintf("SetScriptPosition on %s\n", unichar_string().string());
    int chunk_index = 0;
    for (int blob_index = 0; blob_index < length_; ++blob_index) {
      if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
        TBLOB* tblob = word->blobs[chunk_index];
        ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
                         unichar_id(blob_index));
      }
      chunk_index += state_ != NULL ? state_[blob_index] : 1;
    }
  }
}

Esempio n. 24

0

Mostra file

File: blobs.cpp Progetto: ErwcPKerr/node-dv

// Normalize in-place using the DENORM.
void TWERD::Normalize(const DENORM& denorm) {
  for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
    blob->Normalize(denorm);
  }
}

Esempio n. 25

0

Mostra file

File: reject.cpp Progetto: xmarston/BillRecognizer

// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  int i;
  TBOX out_box;

  if (!tessedit_flip_0O)
    return;

  int num_blobs = word_res->rebuild_word->NumBlobs();
  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB* blob = word_res->rebuild_word->blobs[i];
    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      out_box = blob->bounding_box();
      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
        (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
        return;                  //Beware words with sub/superscripts
    }
  }
  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID ||
      !word_res->uch_set->get_enabled(unichar_O)) {
    return;  // 0 or O are not present/enabled in unicharset
  }
  for (i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 ||
        best_choice->unichar_id(i) == unichar_O) {
      /* A0A */
      if ((i+1) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* A00A */
      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (i+2) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_O, i);
        i++;
      }
      /* AA0<non digit or end of word> */
      if ((i > 1) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (((i+1) < best_choice->length() &&
            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
            !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* 9O9 */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9OOO */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
          (best_choice->unichar_id(i+2) == unichar_0 ||
           best_choice->unichar_id(i+2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        best_choice->set_unichar_id(unichar_0, i+2);
        i += 2;
      }
      /* 9OO<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+2) < best_choice->length() &&
          (best_choice->unichar_id(i+1) == unichar_0 ||
          best_choice->unichar_id(i+1) == unichar_O) &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i+1);
        i++;
      }
      /* 9O<non upper> */
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
          (i+1) < best_choice->length() &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9[.,]OOO.. */
      if ((i > 1) &&
          (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
              word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
           best_choice->unichar_id(i-2) == unichar_O)) {
        if (best_choice->unichar_id(i-2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i-2);
        }
        while (i < best_choice->length() &&
               (best_choice->unichar_id(i) == unichar_O ||
                best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          i++;
        }
        i--;
      }
    }
  }
}

Esempio n. 26

0

Mostra file

File: superscript.cpp Progetto: 0ximDigital/appsScanner

/**
 * Return whether this is believable superscript or subscript text.
 *
 * We insist that:
 *   + there are no punctuation marks.
 *   + there are no italics.
 *   + no normal-sized character is smaller than superscript_scaledown_ratio
 *     of what it ought to be, and
 *   + each character is at least as certain as certainty_threshold.
 *
 *  @param[in]  debug  If true, spew debug output
 *  @param[in]  word   The word whose best_choice we're evaluating
 *  @param[in]  certainty_threshold   If any of the characters have less
 *                    certainty than this, reject.
 *  @param[out]  left_ok  How many left-side characters were ok?
 *  @param[out]  right_ok  How many right-side characters were ok?
 *  @return  Whether the complete best choice is believable as a superscript.
 */
bool Tesseract::BelievableSuperscript(bool debug,
                                      const WERD_RES &word,
                                      float certainty_threshold,
                                      int *left_ok,
                                      int *right_ok) const {
  int initial_ok_run_count = 0;
  int ok_run_count = 0;
  float worst_certainty = 0.0f;
  const WERD_CHOICE &wc = *word.best_choice;

  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
  for (int i = 0; i < wc.length(); i++) {
    TBLOB *blob = word.rebuild_word->blobs[i];
    UNICHAR_ID unichar_id = wc.unichar_id(i);
    float char_certainty = wc.certainty(i);
    bool bad_certainty = char_certainty < certainty_threshold;
    bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
    bool is_italic = word.fontinfo && word.fontinfo->is_italic();
    BLOB_CHOICE *choice = word.GetBlobChoice(i);
    if (choice && fontinfo_table.size() > 0) {
      // Get better information from the specific choice, if available.
      int font_id1 = choice->fontinfo_id();
      bool font1_is_italic = font_id1 >= 0
          ? fontinfo_table.get(font_id1).is_italic() : false;
      int font_id2 = choice->fontinfo_id2();
      is_italic = font1_is_italic &&
          (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
    }

    float height_fraction = 1.0f;
    float char_height = blob->bounding_box().height();
    float normal_height = char_height;
    if (wc.unicharset()->top_bottom_useful()) {
      int min_bot, max_bot, min_top, max_top;
      wc.unicharset()->get_top_bottom(unichar_id,
                                      &min_bot, &max_bot,
                                      &min_top, &max_top);
      float hi_height = max_top - max_bot;
      float lo_height = min_top - min_bot;
      normal_height = (hi_height + lo_height) / 2;
      if (normal_height >= kBlnXHeight) {
        // Only ding characters that we have decent information for because
        // they're supposed to be normal sized, not tiny specks or dashes.
        height_fraction = char_height / normal_height;
      }
    }
    bool bad_height = height_fraction < superscript_scaledown_ratio;

    if (debug) {
      if (is_italic) {
        tprintf(" Rejecting: superscript is italic.\n");
      }
      if (is_punc) {
        tprintf(" Rejecting: punctuation present.\n");
      }
      const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
      if (bad_certainty) {
        tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                "which is less than threshold %.2f\n", char_str,
                char_certainty, certainty_threshold);
      }
      if (bad_height) {
        tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                "expected %.2f\n", char_str, char_height, normal_height);
      }
    }
    if (bad_certainty || bad_height || is_punc || is_italic) {
      if (ok_run_count == i) {
        initial_ok_run_count = ok_run_count;
      }
      ok_run_count = 0;
    } else {
      ok_run_count++;
    }
    if (char_certainty < worst_certainty) {
      worst_certainty = char_certainty;
    }
  }
  bool all_ok = ok_run_count == wc.length();
  if (all_ok && debug) {
    tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
  }
  if (!all_ok) {
    if (left_ok) *left_ok = initial_ok_run_count;
    if (right_ok) *right_ok = ok_run_count;
  }
  return all_ok;
}