void AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats) { stats->Clear(); ASSERT_HOST(word_res != NULL); if (word_res->blob_widths.empty()) { return; } if (debug) { tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", col, row, fixed_pitch ? " (fixed pitch)" : ""); } float normalizing_height = kBlnXHeight; ROW* blob_row = word_res->blob_row; // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? if (fixed_pitch && blob_row != NULL) { // For fixed pitch language like CJK, we use the full text height // as the normalizing factor so we are not dependent on xheight // calculation. if (blob_row->body_size() > 0.0f) { normalizing_height = word_res->denorm.y_scale() * blob_row->body_size(); } else { normalizing_height = word_res->denorm.y_scale() * (blob_row->x_height() + blob_row->ascenders()); } if (debug) { tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", normalizing_height, word_res->denorm.y_scale(), blob_row->x_height(), blob_row->ascenders()); } } float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height; if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true; // Compute the gap sum for this shape. If there are only negative or only // positive gaps, record their sum in stats->gap_sum. However, if there is // a mixture, record only the sum of the positive gaps. // TODO(antonova): explain fragment. int negative_gap_sum = 0; for (int c = col; c < row; ++c) { int gap = word_res->GetBlobsGap(c); (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap; } if (stats->gap_sum == 0) stats->gap_sum = negative_gap_sum; if (debug) { tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n", wh_ratio, max_char_wh_ratio, stats->gap_sum, stats->bad_shape ? "bad_shape" : ""); } // Compute shape_cost (for fixed pitch mode). if (fixed_pitch) { bool end_row = (row == (word_res->ratings->dimension() - 1)); // Ensure that the blob has gaps on the left and the right sides // (except for beginning and ending punctuation) and that there is // no cutting through ink at the blob boundaries. if (col > 0) { float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height; SEAM *left_seam = word_res->seam_array[col - 1]; if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) { stats->bad_shape = true; } if (debug) { tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority, stats->bad_shape ? "bad_shape" : ""); } } float right_gap = 0.0f; if (!end_row) { right_gap = word_res->GetBlobsGap(row) / normalizing_height; SEAM *right_seam = word_res->seam_array[row]; if (right_gap < kMinGap || right_seam->priority > 0.0f) { stats->bad_shape = true; if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true; } if (debug) { tprintf("right_gap %g right_seam %g %s\n", right_gap, right_seam->priority, stats->bad_shape ? "bad_shape" : ""); } } // Impose additional segmentation penalties if blob widths or gaps // distribution don't fit a fixed-pitch model. // Since we only know the widths and gaps of the path explored so far, // the means and variances are computed for the path so far (not // considering characters to the right of the last character on the path). stats->full_wh_ratio = wh_ratio + right_gap; if (parent_stats != NULL) { stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1); stats->full_wh_ratio_var = parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2); } else { stats->full_wh_ratio_total = stats->full_wh_ratio; } if (debug) { tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var); } stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); // For some reason Tesseract prefers to treat the whole CJ words // as one blob when the initial segmentation is particularly bad. // This hack is to avoid favoring such states. if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { stats->shape_cost += 10; } stats->shape_cost += stats->full_wh_ratio_var; if (debug) tprintf("shape_cost %g\n", stats->shape_cost); } }
void AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, const DENORM *denorm, CHUNKS_RECORD *chunks_record, int debug_level, AssociateStats *stats) { stats->Clear(); if (debug_level > 0) { tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", col, row, fixed_pitch ? " (fixed pitch)" : ""); } float normalizing_height = BASELINE_SCALE; // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? if (fixed_pitch && denorm != NULL && denorm->row() != NULL) { // For fixed pitch language like CJK, we use the full text height // as the normalizing factor so we are not dependent on xheight // calculation. if (denorm->row()->body_size() > 0.0f) { normalizing_height = denorm->y_scale() * denorm->row()->body_size(); } else { normalizing_height = denorm->y_scale() * (denorm->row()->x_height() + denorm->row()->ascenders()); } if (debug_level > 0) { tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", normalizing_height, denorm->y_scale(), denorm->row()->x_height(), denorm->row()->ascenders()); } } float wh_ratio = GetChunksWidth(chunks_record->chunk_widths, col, row) / normalizing_height; if (debug_level) tprintf("wh_ratio %g\n", wh_ratio); if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true; if (fixed_pitch) { bool end_row = (row == (chunks_record->ratings->dimension() - 1)); // Ensure that the blob has gaps on the left and the right sides // (except for beginning and ending punctuation) and that there is // no cutting through ink at the blob boundaries. if (col > 0) { float left_gap = GetChunksGap(chunks_record->chunk_widths, col-1) / normalizing_height; SEAM *left_seam = static_cast<SEAM *>(array_value(chunks_record->splits, col-1)); if (debug_level) { tprintf("left_gap %g, left_seam %g\n", left_gap, left_seam->priority); } if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) { stats->bad_shape = true; } } float right_gap = 0.0f; if (!end_row) { right_gap = GetChunksGap(chunks_record->chunk_widths, row) / normalizing_height; SEAM *right_seam = static_cast<SEAM *>(array_value(chunks_record->splits, row)); if (debug_level) { tprintf("right_gap %g right_seam %g\n", right_gap, right_seam->priority); } if (right_gap < kMinGap || right_seam->priority > 0.0f) { stats->bad_shape = true; if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true; } } // Impose additional segmentation penalties if blob widths or gaps // distribution don't fit a fixed-pitch model. // Since we only know the widths and gaps of the path explored so far, // the means and variances are computed for the path so far (not // considering characters to the right of the last character on the path). stats->full_wh_ratio = wh_ratio + right_gap; if (parent_stats != NULL) { stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1); stats->full_wh_ratio_var = parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2); } else { stats->full_wh_ratio_total = stats->full_wh_ratio; } if (debug_level) { tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var); } stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); // For some reason Tesseract prefers to treat the whole CJ words // as one blob when the initial segmentation is particularly bad. // This hack is to avoid favoring such states. if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { stats->shape_cost += 10; } stats->shape_cost += stats->full_wh_ratio_var; if (debug_level) tprintf("shape_cost %g\n", stats->shape_cost); } }