Exemple #1
0
// Creates a report of the error rate. The report_level controls the detail
// that is reported to stderr via tprintf:
// 0   -> no output.
// >=1 -> bottom-line error rate.
// >=3 -> font-level error rate.
// boosting_mode determines the return value. It selects which (un-weighted)
// error rate to return.
// The fontinfo_table from MasterTrainer provides the names of fonts.
// The it determines the current subset of the training samples.
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
// If not NULL, the report string is saved in fonts_report.
// (Ignoring report_level).
double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
                                  const UnicityTable<FontInfo>& fontinfo_table,
                                  const SampleIterator& it,
                                  double* unichar_error,
                                  STRING* fonts_report) {
  // Compute totals over all the fonts and report individual font results
  // when required.
  Counts totals;
  int fontsize = font_counts_.size();
  for (int f = 0; f < fontsize; ++f) {
    // Accumulate counts over fonts.
    totals += font_counts_[f];
    STRING font_report;
    if (ReportString(font_counts_[f], &font_report)) {
      if (fonts_report != NULL) {
        *fonts_report += fontinfo_table.get(f).name;
        *fonts_report += ": ";
        *fonts_report += font_report;
        *fonts_report += "\n";
      }
      if (report_level > 2) {
        // Report individual font error rates.
        tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string());
      }
    }
  }
  if (report_level > 0) {
    // Report the totals.
    STRING total_report;
    if (ReportString(totals, &total_report)) {
      tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
              scaled_error_ * 100.0, total_report.string());
    }
    // Report the worst substitution error only for now.
    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
      const UNICHARSET& unicharset = it.shape_table()->unicharset();
      int charsetsize = unicharset.size();
      int shapesize = it.CompactCharsetSize();
      int worst_uni_id = 0;
      int worst_shape_id = 0;
      int worst_err = 0;
      for (int u = 0; u < charsetsize; ++u) {
        for (int s = 0; s < shapesize; ++s) {
          if (unichar_counts_(u, s) > worst_err) {
            worst_err = unichar_counts_(u, s);
            worst_uni_id = u;
            worst_shape_id = s;
          }
        }
      }
      if (worst_err > 0) {
        tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
                worst_uni_id, unicharset.id_to_unichar(worst_uni_id),
                it.shape_table()->DebugStr(worst_shape_id).string(),
                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
      }
    }
  }
  double rates[CT_SIZE];
  if (!ComputeRates(totals, rates))
    return 0.0;
  // Set output values if asked for.
  if (unichar_error != NULL)
    *unichar_error = rates[CT_UNICHAR_TOP1_ERR];
  return rates[boosting_mode];
}
// Creates a report of the error rate. The report_level controls the detail
// that is reported to stderr via tprintf:
// 0   -> no output.
// >=1 -> bottom-line error rate.
// >=3 -> font-level error rate.
// boosting_mode determines the return value. It selects which (un-weighted)
// error rate to return.
// The fontinfo_table from MasterTrainer provides the names of fonts.
// The it determines the current subset of the training samples.
// If not NULL, the top-choice unichar error rate is saved in unichar_error.
// If not NULL, the report string is saved in fonts_report.
// (Ignoring report_level).
double ErrorCounter::ReportErrors(int report_level, CountTypes boosting_mode,
                                  const FontInfoTable& fontinfo_table,
                                  const SampleIterator& it,
                                  double* unichar_error,
                                  STRING* fonts_report) {
  // Compute totals over all the fonts and report individual font results
  // when required.
  Counts totals;
  int fontsize = font_counts_.size();
  for (int f = 0; f < fontsize; ++f) {
    // Accumulate counts over fonts.
    totals += font_counts_[f];
    STRING font_report;
    if (ReportString(false, font_counts_[f], &font_report)) {
      if (fonts_report != NULL) {
        *fonts_report += fontinfo_table.get(f).name;
        *fonts_report += ": ";
        *fonts_report += font_report;
        *fonts_report += "\n";
      }
      if (report_level > 2) {
        // Report individual font error rates.
        tprintf("%s: %s\n", fontinfo_table.get(f).name, font_report.string());
      }
    }
  }
  // Report the totals.
  STRING total_report;
  bool any_results = ReportString(true, totals, &total_report);
  if (fonts_report != NULL && fonts_report->length() == 0) {
    // Make sure we return something even if there were no samples.
    *fonts_report = "NoSamplesFound: ";
    *fonts_report += total_report;
    *fonts_report += "\n";
  }
  if (report_level > 0) {
    // Report the totals.
    STRING total_report;
    if (any_results) {
      tprintf("TOTAL Scaled Err=%.4g%%, %s\n",
              scaled_error_ * 100.0, total_report.string());
    }
    // Report the worst substitution error only for now.
    if (totals.n[CT_UNICHAR_TOP1_ERR] > 0) {
      int charsetsize = unicharset_.size();
      int worst_uni_id = 0;
      int worst_result_id = 0;
      int worst_err = 0;
      for (int u = 0; u < charsetsize; ++u) {
        for (int v = 0; v < charsetsize; ++v) {
          if (unichar_counts_(u, v) > worst_err) {
            worst_err = unichar_counts_(u, v);
            worst_uni_id = u;
            worst_result_id = v;
          }
        }
      }
      if (worst_err > 0) {
        tprintf("Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n",
                worst_uni_id, unicharset_.id_to_unichar(worst_uni_id),
                unicharset_.id_to_unichar(worst_result_id),
                worst_err, totals.n[CT_UNICHAR_TOP1_ERR],
                100.0 * worst_err / totals.n[CT_UNICHAR_TOP1_ERR]);
      }
    }
    tprintf("Multi-unichar shape use:\n");
    for (int u = 0; u < multi_unichar_counts_.size(); ++u) {
      if (multi_unichar_counts_[u] > 0) {
        tprintf("%d multiple answers for unichar: %s\n",
                multi_unichar_counts_[u],
                unicharset_.id_to_unichar(u));
      }
    }
    tprintf("OK Score histogram:\n");
    ok_score_hist_.print();
    tprintf("ERROR Score histogram:\n");
    bad_score_hist_.print();
  }

  double rates[CT_SIZE];
  if (!ComputeRates(totals, rates))
    return 0.0;
  // Set output values if asked for.
  if (unichar_error != NULL)
    *unichar_error = rates[CT_UNICHAR_TOP1_ERR];
  return rates[boosting_mode];
}
Exemple #3
0
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const UnicityTable<FontInfo>& font_table,
                                    const ShapeTable& shape_table,
                                    const GenericVector<ShapeRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  int res_index = 0;
  bool debug_it = false;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else if (shape_table.GetShape(results[0].shape_id).
          ContainsUnicharAndFont(unichar_id, font_id)) {
    ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
    // Unichar and font OK, but count if multiple unichars.
    if (shape_table.GetShape(results[0].shape_id).size() > 1)
      ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
  } else {
    // This is a top shape error.
    ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
    // Check to see if any font in the top choice has attributes that match.
    bool attributes_match = false;
    uinT32 font_props = font_table.get(font_id).properties;
    const Shape& shape = shape_table.GetShape(results[0].shape_id);
    for (int c = 0; c < shape.size() && !attributes_match; ++c) {
      for (int f = 0; f < shape[c].font_ids.size(); ++f) {
        if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
          attributes_match = true;
          break;
        }
      }
    }
    // TODO(rays) It is easy to add counters for individual font attributes
    // here if we want them.
    if (!attributes_match)
      ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
    if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
    // Find rank of correct unichar answer. (Ignoring the font.)
    while (res_index < num_results &&
           !shape_table.GetShape(results[res_index].shape_id).
                ContainsUnichar(unichar_id)) {
      ++res_index;
    }
    if (res_index == 0) {
      // Unichar OK, but count if multiple unichars.
      if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
      }
    } else {
      // Count maps from unichar id to shape id.
      if (num_results > 0)
        ++unichar_counts_(unichar_id, results[0].shape_id);
      // This is a unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      if (res_index >= MIN(2, num_results)) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (res_index >= num_results) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        debug_it = debug;
      }
    }
  }
  // Compute mean number of return values and mean rank of correct answer.
  font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
  font_counts_[font_id].n[CT_RANK] += res_index;
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
  }
  if (debug_it) {
    tprintf("%d results for char %s font %d :",
            num_results, shape_table.unicharset().id_to_unichar(unichar_id),
            font_id);
    for (int i = 0; i < num_results; ++i) {
      tprintf(" %.3f/%.3f:%s",
              results[i].rating, results[i].font,
              shape_table.DebugStr(results[i].shape_id).string());
    }
    tprintf("\n");
    return true;
  }
  return false;
}
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const FontInfoTable& font_table,
                                    const GenericVector<UnicharRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  int answer_actual_rank = -1;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else {
    // Find rank of correct unichar answer, using rating_epsilon_ to allow
    // different answers to score as equal. (Ignoring the font.)
    int epsilon_rank = 0;
    int answer_epsilon_rank = -1;
    int num_top_answers = 0;
    double prev_rating = results[0].rating;
    bool joined = false;
    bool broken = false;
    int res_index = 0;
    while (res_index < num_results) {
      if (results[res_index].rating < prev_rating - rating_epsilon_) {
        ++epsilon_rank;
        prev_rating = results[res_index].rating;
      }
      if (results[res_index].unichar_id == unichar_id &&
          answer_epsilon_rank < 0) {
        answer_epsilon_rank = epsilon_rank;
        answer_actual_rank = res_index;
      }
      if (results[res_index].unichar_id == UNICHAR_JOINED &&
          unicharset_.has_special_codes())
        joined = true;
      else if (results[res_index].unichar_id == UNICHAR_BROKEN &&
               unicharset_.has_special_codes())
        broken = true;
      else if (epsilon_rank == 0)
        ++num_top_answers;
      ++res_index;
    }
    if (answer_actual_rank != 0) {
      // Correct result is not absolute top.
      ++font_counts_[font_id].n[CT_UNICHAR_TOPTOP_ERR];
      if (boosting_mode == CT_UNICHAR_TOPTOP_ERR) sample->set_is_error(true);
    }
    if (answer_epsilon_rank == 0) {
      ++font_counts_[font_id].n[CT_UNICHAR_TOP_OK];
      // Unichar OK, but count if multiple unichars.
      if (num_top_answers > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
        ++multi_unichar_counts_[unichar_id];
      }
      // Check to see if any font in the top choice has attributes that match.
      // TODO(rays) It is easy to add counters for individual font attributes
      // here if we want them.
      if (font_table.SetContainsFontProperties(
          font_id, results[answer_actual_rank].fonts)) {
        // Font attributes were matched.
        // Check for multiple properties.
        if (font_table.SetContainsMultipleFontProperties(
            results[answer_actual_rank].fonts))
          ++font_counts_[font_id].n[CT_OK_MULTI_FONT];
      } else {
        // Font attributes weren't matched.
        ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
      }
    } else {
      // This is a top unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      // Count maps from unichar id to wrong unichar id.
      ++unichar_counts_(unichar_id, results[0].unichar_id);
      if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (answer_epsilon_rank < 0) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        answer_epsilon_rank = epsilon_rank;
      }
    }
    // Compute mean number of return values and mean rank of correct answer.
    font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
    font_counts_[font_id].n[CT_RANK] += answer_epsilon_rank;
    if (joined)
      ++font_counts_[font_id].n[CT_OK_JOINED];
    if (broken)
      ++font_counts_[font_id].n[CT_OK_BROKEN];
  }
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
    if (debug) {
      tprintf("%d results for char %s font %d :",
              num_results, unicharset_.id_to_unichar(unichar_id),
              font_id);
      for (int i = 0; i < num_results; ++i) {
        tprintf(" %.3f : %s\n",
                results[i].rating,
                unicharset_.id_to_unichar(results[i].unichar_id));
      }
      return true;
    }
    int percent = 0;
    if (num_results > 0)
      percent = IntCastRounded(results[0].rating * 100);
    bad_score_hist_.add(percent, 1);
  } else {
    int percent = 0;
    if (answer_actual_rank >= 0)
      percent = IntCastRounded(results[answer_actual_rank].rating * 100);
    ok_score_hist_.add(percent, 1);
  }
  return false;
}