예제 #1
0
// Helper function to get the index of the first result with the required
// unichar_id. If the results are sorted by rating, this will also be the
// best result with the required unichar_id.
// Returns -1 if the unichar_id is not found
int ShapeRating::FirstResultWithUnichar(
    const GenericVector<ShapeRating>& results,
    const ShapeTable& shape_table,
    UNICHAR_ID unichar_id) {
  for (int r = 0; r < results.size(); ++r) {
    int shape_id = results[r].shape_id;
    const Shape& shape = shape_table.GetShape(shape_id);
    if (shape.ContainsUnichar(unichar_id)) {
      return r;
    }
  }
  return -1;
}
예제 #2
0
// Accumulates counts for junk. Counts only whether the junk was correctly
// rejected or not.
void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table,
                                  const GenericVector<ShapeRating>& results,
                                  TrainingSample* sample) {
  // For junk we accept no answer, or an explicit shape answer matching the
  // class id of the sample.
  int num_results = results.size();
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  if (num_results > 0 &&
      !shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id)) {
    // This is a junk error.
    ++font_counts_[font_id].n[CT_ACCEPTED_JUNK];
    sample->set_is_error(true);
    // It counts as an error for boosting too so sum the weight.
    scaled_error_ += sample->weight();
  } else {
    // Correctly rejected.
    ++font_counts_[font_id].n[CT_REJECTED_JUNK];
    sample->set_is_error(false);
  }
}
예제 #3
0
// Populates this table from shape by adding one entry per (unichar, font)
// pair in shape that is not already present in this table.
// For each newly added entry, if master_shapes has a shape for the same
// pair, shape holds more than one unichar, and that master shape is a strict
// subset of shape (subset of shape, but shape is not a subset of it), the
// master shape's contents are merged into the new entry.
// Returns the number of entries that received a master shape this way.
int ShapeTable::BuildFromShape(const Shape& shape,
                               const ShapeTable& master_shapes) {
  int merged_masters = 0;
  for (int u = 0; u < shape.size(); ++u) {
    for (int f = 0; f < shape[u].font_ids.size(); ++f) {
      const int unichar_id = shape[u].unichar_id;
      const int font_id = shape[u].font_ids[f];
      // Skip pairs that this table already covers.
      if (FindShape(unichar_id, font_id) >= 0) continue;
      const int new_shape_id = AddShape(unichar_id, font_id);
      const int master_id = master_shapes.FindShape(unichar_id, font_id);
      // No master for this pair, or a single-unichar shape: nothing to merge.
      if (master_id < 0 || shape.size() <= 1) continue;
      const Shape& master = master_shapes.GetShape(master_id);
      // Only merge masters that are strictly contained in shape.
      if (!master.IsSubsetOf(shape) || shape.IsSubsetOf(master)) continue;
      shape_table_[new_shape_id]->AddShape(master);
      ++merged_masters;
    }
  }
  return merged_masters;
}
예제 #4
0
// Expands all the classes/fonts in the shape individually to build
// a ShapeTable.
int ShapeTable::BuildFromShape(const Shape& shape,
                               const ShapeTable& master_shapes) {
  BitVector shape_map(master_shapes.NumShapes());
  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
    for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
      int c = shape[u_ind].unichar_id;
      int f = shape[u_ind].font_ids[f_ind];
      int master_id = master_shapes.FindShape(c, f);
      if (master_id >= 0) {
        shape_map.SetBit(master_id);
      } else if (FindShape(c, f) < 0) {
        AddShape(c, f);
      }
    }
  }
  int num_masters = 0;
  for (int s = 0; s < master_shapes.NumShapes(); ++s) {
    if (shape_map[s]) {
      AddShape(master_shapes.GetShape(s));
      ++num_masters;
    }
  }
  return num_masters;
}
예제 #5
0
// Accumulates the errors from the classifier results on a single sample.
// Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
// boosting_mode selects the type of error to be used for boosting and the
// is_error_ member of sample is set according to whether the required type
// of error occurred. The font_table provides access to font properties
// for error counting and shape_table is used to understand the relationship
// between unichar_ids and shape_ids in the results
bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode,
                                    const UnicityTable<FontInfo>& font_table,
                                    const ShapeTable& shape_table,
                                    const GenericVector<ShapeRating>& results,
                                    TrainingSample* sample) {
  int num_results = results.size();
  int res_index = 0;
  bool debug_it = false;
  int font_id = sample->font_id();
  int unichar_id = sample->class_id();
  sample->set_is_error(false);
  if (num_results == 0) {
    // Reject. We count rejects as a separate category, but still mark the
    // sample as an error in case any training module wants to use that to
    // improve the classifier.
    sample->set_is_error(true);
    ++font_counts_[font_id].n[CT_REJECT];
  } else if (shape_table.GetShape(results[0].shape_id).
          ContainsUnicharAndFont(unichar_id, font_id)) {
    ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT];
    // Unichar and font OK, but count if multiple unichars.
    if (shape_table.GetShape(results[0].shape_id).size() > 1)
      ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
  } else {
    // This is a top shape error.
    ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR];
    // Check to see if any font in the top choice has attributes that match.
    bool attributes_match = false;
    uinT32 font_props = font_table.get(font_id).properties;
    const Shape& shape = shape_table.GetShape(results[0].shape_id);
    for (int c = 0; c < shape.size() && !attributes_match; ++c) {
      for (int f = 0; f < shape[c].font_ids.size(); ++f) {
        if (font_table.get(shape[c].font_ids[f]).properties == font_props) {
          attributes_match = true;
          break;
        }
      }
    }
    // TODO(rays) It is easy to add counters for individual font attributes
    // here if we want them.
    if (!attributes_match)
      ++font_counts_[font_id].n[CT_FONT_ATTR_ERR];
    if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true);
    // Find rank of correct unichar answer. (Ignoring the font.)
    while (res_index < num_results &&
           !shape_table.GetShape(results[res_index].shape_id).
                ContainsUnichar(unichar_id)) {
      ++res_index;
    }
    if (res_index == 0) {
      // Unichar OK, but count if multiple unichars.
      if (shape_table.GetShape(results[res_index].shape_id).size() > 1) {
        ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR];
      }
    } else {
      // Count maps from unichar id to shape id.
      if (num_results > 0)
        ++unichar_counts_(unichar_id, results[0].shape_id);
      // This is a unichar error.
      ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR];
      if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true);
      if (res_index >= MIN(2, num_results)) {
        // It is also a 2nd choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR];
        if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true);
      }
      if (res_index >= num_results) {
        // It is also a top-n choice unichar error.
        ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR];
        if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true);
        debug_it = debug;
      }
    }
  }
  // Compute mean number of return values and mean rank of correct answer.
  font_counts_[font_id].n[CT_NUM_RESULTS] += num_results;
  font_counts_[font_id].n[CT_RANK] += res_index;
  // If it was an error for boosting then sum the weight.
  if (sample->is_error()) {
    scaled_error_ += sample->weight();
  }
  if (debug_it) {
    tprintf("%d results for char %s font %d :",
            num_results, shape_table.unicharset().id_to_unichar(unichar_id),
            font_id);
    for (int i = 0; i < num_results; ++i) {
      tprintf(" %.3f/%.3f:%s",
              results[i].rating, results[i].font,
              shape_table.DebugStr(results[i].shape_id).string());
    }
    tprintf("\n");
    return true;
  }
  return false;
}