// Returns true if the shape contains multiple different font properties, // ignoring unichar_id. bool Shape::ContainsMultipleFontProperties( const FontInfoTable& font_table) const { uinT32 properties = font_table.get(unichars_[0].font_ids[0]).properties; for (int c = 0; c < unichars_.size(); ++c) { GenericVector<int>& font_list = unichars_[c].font_ids; for (int f = 0; f < font_list.size(); ++f) { if (font_table.get(font_list[f]).properties != properties) return true; } } return false; }
// Creates a report of the error rate. The report_level controls the detail // that is reported to stderr via tprintf: // 0 -> no output. // >=1 -> bottom-line error rate. // >=3 -> font-level error rate. // boosting_mode determines the return value. It selects which (un-weighted) // error rate to return. // The fontinfo_table from MasterTrainer provides the names of fonts. // The it determines the current subset of the training samples. // If not NULL, the top-choice unichar error rate is saved in unichar_error. // If not NULL, the report string is saved in fonts_report. // (Ignoring report_level). double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode, const FontInfoTable& fontinfo_table, const SampleIterator& it, double* unichar_error, STRING* fonts_report) { // Compute totals over all the fonts and report individual font results // when required. Counts totals; int fontsize = font_counts_.size(); for (int f = 0; f < fontsize; ++f) { // Accumulate counts over fonts. totals += font_counts_[f]; STRING font_report; if (ReportString(false, font_counts_[f], &font_report)) { if (fonts_report != NULL) { *fonts_report += fontinfo_table.get(f).name; *fonts_report += ": "; *fonts_report += font_report; *fonts_report += "\n"; } if (report_level > 2) { // Report individual font error rates. tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string()); } } } // Report the totals. STRING total_report; bool any_results = ReportString(true, totals, &total_report); if (fonts_report != NULL && fonts_report->length() == 0) { // Make sure we return something even if there were no samples. *fonts_report = "NoSamplesFound: "; *fonts_report += total_report; *fonts_report += "\n"; } if (report_level > 0) { // Report the totals. STRING total_report; if (any_results) { tprintf("TOTAL Scaled Err=%.4g%%, %s\n", scaled_error_ * 100.0, total_report.string()); } // Report the worst substitution error only for now. 
if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) { int charsetsize = unicharset_.size(); int worst_uni_id = 0; int worst_result_id = 0; int worst_err = 0; for (int u = 0; u < charsetsize; ++u) { for (int v = 0; v < charsetsize; ++v) { if (unichar_counts_(u, v) > worst_err) { worst_err = unichar_counts_(u, v); worst_uni_id = u; worst_result_id = v; } } } if (worst_err > 0) { tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n", worst_uni_id, unicharset_.id_to_unichar(worst_uni_id), unicharset_.id_to_unichar(worst_result_id), worst_err, totals.n[CT_UNICHAR_TOP1_ERR], 100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]); } } tprintf("Multi-unichar shape use:\n"); for (int u = 0; u < multi_unichar_counts_.size(); ++u) { if (multi_unichar_counts_[u] > 0) { tprintf("%d multiple answers for unichar: %s\n", multi_unichar_counts_[u], unicharset_.id_to_unichar(u)); } } tprintf("OK Score histogram:\n"); ok_score_hist_.print(); tprintf("ERROR Score histogram:\n"); bad_score_hist_.print(); } double rates[CT_SIZE]; if (!ComputeRates(totals, rates)) return 0.0; // Set output values if asked for. if (unichar_error != NULL) *unichar_error = rates[CT_UNICHAR_TOP1_ERR]; return rates[boosting_mode]; }
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const FontInfoTable& font_table,
                                    const GenericVector<UnicharRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  // Index of the correct answer in results, or -1 if it never appears.
  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else {
    // Find rank of correct unichar answer, using rating_epsilon_ to allow
    // different answers to score as equal. (Ignoring the font.)
    int epsilon_rank = 0;
    int answer_epsilon_rank = -1;
    int num_top_answers = 0;
    double prev_rating = results[0].rating;
    bool joined = false;
    bool broken = false;
    int res_index = 0;
    while (res_index < num_results) {
      // A rating more than rating_epsilon_ below the previous distinct
      // rating starts a new epsilon-rank group.
      if (results[res_index].rating < prev_rating - rating_epsilon_) {
        ++epsilon_rank;
        prev_rating = results[res_index].rating;
      }
      // Record the first (best-ranked) occurrence of the correct unichar.
      if (results[res_index].unichar_id == unichar_id &&
          answer_epsilon_rank < 0) {
        answer_epsilon_rank = epsilon_rank;
        answer_actual_rank = res_index;
      }
      if (results[res_index].unichar_id == UNICHAR_JOINED &&
          unicharset_.has_special_codes())
        joined = true;
      else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
               unicharset_.has_special_codes())
        broken = true;
      else if (epsilon_rank == 0)
        // Non-special answer tied (within epsilon) with the top choice.
        ++num_top_answers;
      ++res_index;
    }
    if (answer_actual_rank != 0) {
      // Correct result is not absolute top.
      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) sample->set_is_error(true);
    }
    if (answer_epsilon_rank == 0) {
      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
        ++multi_unichar_counts_[unichar_id];
      }
      // Check to see if any font in the top choice has attributes that match.
      // TODO(rays) It is easy to add counters for individual font attributes
      // here if we want them.
      if (font_table.SetContainsFontProperties(
          font_id, results[answer_actual_rank].fonts)) {
        // Font attributes were matched.
        // Check for multiple properties.
        if (font_table.SetContainsMultipleFontProperties(
            results[answer_actual_rank].fonts))
          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
      } else {
        // Font attributes weren't matched.
        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      // Count maps from unichar id to wrong unichar id.
      ++unichar_counts_(unichar_id, results[0].unichar_id);
      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        // Treat a missing answer as ranked past the end for the mean below.
        answer_epsilon_rank = epsilon_rank;
      }
    }
    // Compute mean number of return values and mean rank of correct answer.
    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
    if (joined)
      ++font_counts_[font_id].n[CT_OK_JOINED];
    if (broken)
      ++font_counts_[font_id].n[CT_OK_BROKEN];
  }
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
    if (debug) {
      // Dump the full result list for this sample and bail out early
      // (the debug path skips the histogram update below).
      tprintf("%d results for char %s font %d :",
              num_results, unicharset_.id_to_unichar(unichar_id),
              font_id);
      for (int i = 0; i < num_results; ++i) {
        tprintf(" %.3f : %s\n",
                results[i].rating,
                unicharset_.id_to_unichar(results[i].unichar_id));
      }
      return true;
    }
    // Histogram the top rating (as a rounded percent) of error samples.
    int percent = 0;
    if (num_results > 0)
      percent = IntCastRounded(results[0].rating * 100);
    bad_score_hist_.add(percent, 1);
  } else {
    // Histogram the correct answer's rating for non-error samples.
    int percent = 0;
    if (answer_actual_rank >= 0)
      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
    ok_score_hist_.add(percent, 1);
  }
  return false;
}