Exemplo n.º 1
0
/**********************************************************************
 * word_associator
 *
 * Populate a CHUNKS_RECORD from the chopped blobs and seam array of the
 * given word, compute one weight per chunk, and then run the segmentation
 * search selected by enable_new_segsearch (SegSearch when set, otherwise
 * best_first_search).
 * Both width records built here are released with free_widths() before
 * returning.  The ratings matrix obtained from record_piece_ratings() is
 * returned to the caller.
 * NOTE(review): the caller presumably takes ownership of the returned
 * matrix - confirm against the call sites.
 **********************************************************************/
MATRIX *Wordrec::word_associator(WERD_RES *word,
                                 STATE *state,
                                 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                 DANGERR *fixpt,
                                 STATE *best_state) {
  TBLOB* chopped_blobs = word->chopped_word->blobs;
  // There is one more chunk than there are entries in the seam array.
  const int chunk_count = array_count(word->seam_array) + 1;

  CHUNKS_RECORD record;
  record.chunks = chopped_blobs;
  record.splits = word->seam_array;
  record.ratings = record_piece_ratings(chopped_blobs);
  // Two separate width records are built from the same blobs; each one is
  // freed on its own at the end of this function.
  record.char_widths = blobs_widths(chopped_blobs);
  record.chunk_widths = blobs_widths(chopped_blobs);

  /* Save chunk weights */
  BLOB_WEIGHTS weights;
  BLOB_CHOICE_IT choice_it;
  for (int chunk = 0; chunk < chunk_count; ++chunk) {
    // Rating list of the single-chunk piece [chunk, chunk].
    choice_it.set_to_list(get_piece_rating(record.ratings, chopped_blobs,
                                           word->seam_array, chunk, chunk));
    // A zero certainty would cause a division by zero below, so such a
    // chunk is simply given a weight of 0.
    inT16 weight = 0;
    if (choice_it.data()->certainty() != 0) {
      weight = -static_cast<inT16>(10 * choice_it.data()->rating() /
                                   choice_it.data()->certainty());
    }
    weights[chunk] = weight;
  }
  record.weights = weights;

  if (chop_debug) {
    record.ratings->print(getDict().getUnicharset());
  }

  if (!enable_new_segsearch) {
    best_first_search(&record, best_char_choices, word,
                      state, fixpt, best_state);
  } else {
    SegSearch(&record, word->best_choice,
              best_char_choices, word->raw_choice, state);
  }

  free_widths(record.chunk_widths);
  free_widths(record.char_widths);
  return record.ratings;
}
/**********************************************************************
 * width_priority
 *
 * Compute a penalty for the segmentation described by state, based on
 * the character widths (and gaps) that segmentation produces.
 *
 * In proportional mode only characters whose width, normalized by the
 * text height, exceeds heuristic_max_char_wh_ratio are penalized, by
 * the amount of the excess.
 * In fixed-pitch mode (assume_fixed_pitch_char_segment) the penalty is
 * made of the width and gap variances plus a per-character width cost
 * and gap cost; a lone character wider than MAX_SQUAT costs 10 more.
 *
 * TODO(dsl): generalize this to use a PDF estimate for proportional and
 * fixed pitch mode.
 **********************************************************************/
FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record,
                                STATE *state,
                                int num_joints) {
  WIDTH_RECORD *seg_widths = state_char_widths(chunks_record->chunk_widths,
                                               state, num_joints);

  // With baseline normalization (noted as the current Tesseract default) the
  // fixed value BASELINE_SCALE (128) is used as the normalizing height.
  FLOAT32 norm_height;
  if (!classify_baseline_normalized) {
    // The original author notes that this path doesn't work and is never
    // invoked.
    norm_height = chunks_record->row->lineheight;
  } else {
    norm_height = BASELINE_SCALE;
  }

  FLOAT32 total_penalty = 0.0;
  if (assume_fixed_pitch_char_segment) {
    // Fixed pitch languages such as CJK normalize by the full text height so
    // that the result does not depend on the xheight calculation.  In the
    // normalized coordinates xheight * scale == BASELINE_SCALE(128), so the
    // proportionally scaled ascender zone is added to get the full height.
    norm_height = tess_denorm->scale() *
        (tess_denorm->row()->x_height() + tess_denorm->row()->ascenders());
    if (segment_adjust_debug > 1) {
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              tess_denorm->row()->x_height(), tess_denorm->row()->ascenders(),
              norm_height);
    }
    // Extra penalties when the distribution of blob widths or gaps does not
    // fit a fixed-pitch model.
    FLOAT32 width_variance = get_width_variance(seg_widths, norm_height);
    FLOAT32 gap_variance = get_gap_variance(seg_widths, norm_height);
    total_penalty += width_variance;
    total_penalty += gap_variance;
  }

  // widths[] interleaves the values: char width at even indices, the gap
  // that follows it at odd indices.  The last char has no following gap.
  const int char_count = seg_widths->num_chars;
  for (int ch = 0; ch < char_count; ++ch) {
    const bool is_last = (ch == char_count - 1);
    FLOAT32 squat = seg_widths->widths[2 * ch];
    FLOAT32 gap = is_last ? 0 : seg_widths->widths[2 * ch + 1];
    squat /= norm_height;
    gap /= norm_height;

    if (assume_fixed_pitch_char_segment) {
      const bool is_edge = (ch == 0) || is_last;
      total_penalty += fp_width_cost(squat, is_edge);
      total_penalty += fp_gap_cost(gap, is_last);
      if (char_count == 1 && squat > MAX_SQUAT) {
        total_penalty += 10;
      }
    } else if (squat > heuristic_max_char_wh_ratio) {
      // Same as the original equation when
      // heuristic_max_char_wh_ratio == MAX_SQUAT.
      total_penalty += squat - heuristic_max_char_wh_ratio;
    }
  }

  free_widths(seg_widths);
  return total_penalty;
}
Exemplo n.º 3
0
/**
 * @name replace_char_widths
 *
 * Free the char_widths record currently held by chunks_record and install
 * a freshly allocated one filled from last_segmentation.
 *
 * The number of blobs is state[0] + 1.  For every blob its width is
 * stored at widths[2 * i]; for every blob except the last, the gap that
 * follows it is stored at widths[2 * i + 1].
 */
void Wordrec::replace_char_widths(CHUNKS_RECORD *chunks_record,
                                  SEARCH_STATE state) {
  free_widths(chunks_record->char_widths);

  const int blob_count = state[0] + 1;
  // 2 * blob_count ints: one for num_chars plus blob_count widths and
  // blob_count - 1 gaps.
  WIDTH_RECORD *fresh_widths =
      (WIDTH_RECORD *) memalloc(sizeof (int) * blob_count * 2);
  fresh_widths->num_chars = blob_count;

  for (int blob = 0; blob < blob_count; ++blob) {
    const int width_slot = 2 * blob;
    fresh_widths->widths[width_slot] = last_segmentation[blob].width;

    // The last blob has no gap after it.
    const bool has_following_blob = (blob + 1 < blob_count);
    if (has_following_blob) {
      fresh_widths->widths[width_slot + 1] = last_segmentation[blob].gap;
    }
  }

  chunks_record->char_widths = fresh_widths;
}