/**
 * @name word_associator
 *
 * Set up a CHUNKS_RECORD for the chopped blobs and seams of the given word,
 * derive a weight for every chunk from its top-rated classifier choice, and
 * then run the segmentation search over the chunks (SegSearch when
 * enable_new_segsearch is set, best_first_search otherwise).
 *
 * The width records created here are released before returning.  The
 * ratings matrix built by record_piece_ratings() is returned to the caller.
 * NOTE(review): the caller presumably takes ownership of the returned
 * matrix - confirm against the call sites.
 */
MATRIX *Wordrec::word_associator(WERD_RES *word,
                                 STATE *state,
                                 BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                 DANGERR *fixpt,
                                 STATE *best_state) {
  const int num_chunks = array_count(word->seam_array) + 1;
  TBLOB *blobs = word->chopped_word->blobs;

  // Describe the chopped word to the search code.
  CHUNKS_RECORD chunks_record;
  chunks_record.chunks = blobs;
  chunks_record.splits = word->seam_array;
  chunks_record.ratings = record_piece_ratings(blobs);
  chunks_record.char_widths = blobs_widths(blobs);
  chunks_record.chunk_widths = blobs_widths(blobs);

  // Save one weight per chunk, taken from the choice the iterator is
  // positioned on after being attached to that chunk's rating list.
  BLOB_WEIGHTS blob_weights;
  BLOB_CHOICE_IT choice_it;
  for (int chunk = 0; chunk < num_chunks; ++chunk) {
    choice_it.set_to_list(get_piece_rating(chunks_record.ratings,
                                           blobs,
                                           word->seam_array,
                                           chunk,
                                           chunk));
    if (choice_it.data()->certainty() != 0) {
      blob_weights[chunk] =
          -(inT16) (10 * choice_it.data()->rating() /
                    choice_it.data()->certainty());
    } else {
      // A zero certainty would otherwise cause a divide by zero.
      blob_weights[chunk] = 0;
    }
  }
  chunks_record.weights = blob_weights;

  if (chop_debug) {
    chunks_record.ratings->print(getDict().getUnicharset());
  }

  // Run whichever segmentation search is enabled.
  if (enable_new_segsearch) {
    SegSearch(&chunks_record, word->best_choice,
              best_char_choices, word->raw_choice, state);
  } else {
    best_first_search(&chunks_record, best_char_choices, word,
                      state, fixpt, best_state);
  }

  // The width records were only needed for the duration of the search.
  free_widths(chunks_record.chunk_widths);
  free_widths(chunks_record.char_widths);

  return chunks_record.ratings;
}
/**********************************************************************
 * width_priority
 *
 * Compute a priority (penalty) value for a word segmentation from the
 * character widths found in that segmentation.
 *
 * For variable-pitch fonts only really wide, squat blobs are penalized,
 * as before.  For fixed-pitch fonts the penalty also includes a measure
 * of how consistent the character and gap widths are.
 *
 * TODO(dsl): generalize this to use a PDF estimate for proportional and
 * fixed pitch mode.
 **********************************************************************/
FLOAT32 Wordrec::width_priority(CHUNKS_RECORD *chunks_record,
                                STATE *state,
                                int num_joints) {
  WIDTH_RECORD *width_rec =
      state_char_widths(chunks_record->chunk_widths, state, num_joints);
  FLOAT32 penalty = 0.0;

  // When baseline_enable==True, which is the current default for Tesseract,
  // a fixed value of 128 (BASELINE_SCALE) is always used.
  FLOAT32 normalizing_height = BASELINE_SCALE;
  if (!classify_baseline_normalized) {
    // This doesn't work and is never invoked.
    normalizing_height = chunks_record->row->lineheight;
  }

  if (assume_fixed_pitch_char_segment) {
    // For fixed pitch languages such as CJK the full text height is the
    // normalizing factor, so there is no dependency on the xheight
    // calculation.  In normalized coordinates
    // xheight * scale == BASELINE_SCALE(128), so the proportionally scaled
    // ascender zone is added to obtain the full text height.
    normalizing_height =
        tess_denorm->scale() * (tess_denorm->row()->x_height() +
                                tess_denorm->row()->ascenders());
    if (segment_adjust_debug > 1) {
      tprintf("WidthPriority: %f %f normalizing height = %f\n",
              tess_denorm->row()->x_height(),
              tess_denorm->row()->ascenders(),
              normalizing_height);
    }
    // Extra segmentation penalties when the distribution of blob widths
    // or gaps does not fit a fixed-pitch model.
    FLOAT32 width_var = get_width_variance(width_rec, normalizing_height);
    FLOAT32 gap_var = get_gap_variance(width_rec, normalizing_height);
    penalty += width_var;
    penalty += gap_var;
  }

  // widths[] interleaves entries: even slots are character widths, odd
  // slots are the gaps that follow them (the last character has no gap).
  const int num_chars = width_rec->num_chars;
  for (int ch = 0; ch < num_chars; ++ch) {
    const bool is_first = (ch == 0);
    const bool is_last = (ch == num_chars - 1);

    FLOAT32 squat = width_rec->widths[2 * ch];
    FLOAT32 gap = 0;
    if (!is_last) {
      gap = width_rec->widths[2 * ch + 1];
    }
    squat /= normalizing_height;
    gap /= normalizing_height;

    if (!assume_fixed_pitch_char_segment) {
      // Original equation when heuristic_max_char_ratio == MAX_SQUAT.
      if (squat > heuristic_max_char_wh_ratio) {
        penalty += squat - heuristic_max_char_wh_ratio;
      }
      continue;
    }

    penalty += fp_width_cost(squat, is_first || is_last);
    penalty += fp_gap_cost(gap, is_last);
    if (num_chars == 1 && squat > MAX_SQUAT) {
      penalty += 10;
    }
  }

  free_widths(width_rec);
  return penalty;
}
/**
 * @name replace_char_widths
 *
 * Discard the char_widths record currently held by the chunks_record and
 * install a newly allocated one filled with the width and gap measurements
 * stored in last_segmentation.
 *
 * The number of blobs is taken as state[0] + 1.  Widths and gaps are
 * written interleaved (width, gap, width, gap, ..., width); no gap is
 * stored after the final blob.
 */
void Wordrec::replace_char_widths(CHUNKS_RECORD *chunks_record,
                                  SEARCH_STATE state) {
  // Release the old measurements before building the replacement.
  free_widths(chunks_record->char_widths);

  const int num_blobs = state[0] + 1;
  WIDTH_RECORD *fresh_widths =
      (WIDTH_RECORD *) memalloc(sizeof (int) * num_blobs * 2);
  fresh_widths->num_chars = num_blobs;

  // `slot` is the next position to write in the interleaved widths array.
  int slot = 0;
  for (int blob = 0; blob < num_blobs; ++blob) {
    fresh_widths->widths[slot++] = last_segmentation[blob].width;
    if (blob + 1 < num_blobs) {
      fresh_widths->widths[slot++] = last_segmentation[blob].gap;
    }
  }

  chunks_record->char_widths = fresh_widths;
}