Ejemplo n.º 1
0
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const UnicityTable<FontInfo>& font_table,
                                    const ShapeTable& shape_table,
                                    const GenericVector<ShapeRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  int res_index = 0;
  bool debug_it = false;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else if (shape_table.GetShape(results[0].shape_id).
          ContainsUnicharAndFont(unichar_id, font_id)) {
    ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
    // Unichar and font OK, but count if multiple unichars.
    if (shape_table.GetShape(results[0].shape_id).size() > 1)
      ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
  } else {
    // This is a top shape error.
    ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
    // Check to see if any font in the top choice has attributes that match.
    bool attributes_match = false;
    uinT32 font_props = font_table.get(font_id).properties;
    const Shape& shape = shape_table.GetShape(results[0].shape_id);
    for (int c = 0; c < shape.size() && !attributes_match; ++c) {
      for (int f = 0; f < shape[c].font_ids.size(); ++f) {
        if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
          attributes_match = true;
          break;
        }
      }
    }
    // TODO(rays) It is easy to add counters for individual font attributes
    // here if we want them.
    if (!attributes_match)
      ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
    if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
    // Find rank of correct unichar answer. (Ignoring the font.)
    while (res_index < num_results &&
           !shape_table.GetShape(results[res_index].shape_id).
                ContainsUnichar(unichar_id)) {
      ++res_index;
    }
    if (res_index == 0) {
      // Unichar OK, but count if multiple unichars.
      if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
      }
    } else {
      // Count maps from unichar id to shape id.
      if (num_results > 0)
        ++unichar_counts_(unichar_id, results[0].shape_id);
      // This is a unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      if (res_index >= MIN(2, num_results)) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (res_index >= num_results) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        debug_it = debug;
      }
    }
  }
  // Compute mean number of return values and mean rank of correct answer.
  font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
  font_counts_[font_id].n[CT_RANK] += res_index;
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
  }
  if (debug_it) {
    tprintf("%d results for char %s font %d :",
            num_results, shape_table.unicharset().id_to_unichar(unichar_id),
            font_id);
    for (int i = 0; i < num_results; ++i) {
      tprintf(" %.3f/%.3f:%s",
              results[i].rating, results[i].font,
              shape_table.DebugStr(results[i].shape_id).string());
    }
    tprintf("\n");
    return true;
  }
  return false;
}