// Appends the master shapes from other to this. // If not NULL, shape_map is set to map other shape_ids to this's shape_ids. void ShapeTable::AppendMasterShapes(const ShapeTable& other, GenericVector<int>* shape_map) { if (shape_map != NULL) shape_map->init_to_size(other.NumShapes(), -1); for (int s = 0; s < other.shape_table_.size(); ++s) { if (other.shape_table_[s]->destination_index() < 0) { int index = AddShape(*other.shape_table_[s]); if (shape_map != NULL) (*shape_map)[s] = index; } } }
// Helper function to get the index of the first result with the required // unichar_id. If the results are sorted by rating, this will also be the // best result with the required unichar_id. // Returns -1 if the unichar_id is not found int ShapeRating::FirstResultWithUnichar( const GenericVector<ShapeRating>& results, const ShapeTable& shape_table, UNICHAR_ID unichar_id) { for (int r = 0; r < results.size(); ++r) { int shape_id = results[r].shape_id; const Shape& shape = shape_table.GetShape(shape_id); if (shape.ContainsUnichar(unichar_id)) { return r; } } return -1; }
// Expands all the classes/fonts in the shape individually to build // a ShapeTable. int ShapeTable::BuildFromShape(const Shape& shape, const ShapeTable& master_shapes) { int num_masters = 0; for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { int c = shape[u_ind].unichar_id; int f = shape[u_ind].font_ids[f_ind]; if (FindShape(c, f) < 0) { int shape_id = AddShape(c, f); int master_id = master_shapes.FindShape(c, f); if (master_id >= 0 && shape.size() > 1) { const Shape& master = master_shapes.GetShape(master_id); if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) { // Add everything else from the master shape. shape_table_[shape_id]->AddShape(master); ++num_masters; } } } } } return num_masters; }
// Helper to write the shape_table. void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) { STRING shape_table_file = file_prefix; shape_table_file += kShapeTableFileSuffix; FILE* fp = fopen(shape_table_file.string(), "wb"); if (fp != nullptr) { if (!shape_table.Serialize(fp)) { fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.string()); } fclose(fp); } else { fprintf(stderr, "Error creating shape table: %s\n", shape_table_file.string()); } }
// Expands all the classes/fonts in the shape individually to build // a ShapeTable. int ShapeTable::BuildFromShape(const Shape& shape, const ShapeTable& master_shapes) { BitVector shape_map(master_shapes.NumShapes()); for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { int c = shape[u_ind].unichar_id; int f = shape[u_ind].font_ids[f_ind]; int master_id = master_shapes.FindShape(c, f); if (master_id >= 0) { shape_map.SetBit(master_id); } else if (FindShape(c, f) < 0) { AddShape(c, f); } } } int num_masters = 0; for (int s = 0; s < master_shapes.NumShapes(); ++s) { if (shape_map[s]) { AddShape(master_shapes.GetShape(s)); ++num_masters; } } return num_masters; }
// Accumulates counts for junk. Counts only whether the junk was correctly // rejected or not. void ErrorCounter::AccumulateJunk(const ShapeTable& shape_table, const GenericVector<ShapeRating>& results, TrainingSample* sample) { // For junk we accept no answer, or an explicit shape answer matching the // class id of the sample. int num_results = results.size(); int font_id = sample->font_id(); int unichar_id = sample->class_id(); if (num_results > 0 && !shape_table.GetShape(results[0].shape_id).ContainsUnichar(unichar_id)) { // This is a junk error. ++font_counts_[font_id].n[CT_ACCEPTED_JUNK]; sample->set_is_error(true); // It counts as an error for boosting too so sum the weight. scaled_error_ += sample->weight(); } else { // Correctly rejected. ++font_counts_[font_id].n[CT_REJECTED_JUNK]; sample->set_is_error(false); } }
// Accumulates the errors from the classifier results on a single sample. // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred. // boosting_mode selects the type of error to be used for boosting and the // is_error_ member of sample is set according to whether the required type // of error occurred. The font_table provides access to font properties // for error counting and shape_table is used to understand the relationship // between unichar_ids and shape_ids in the results bool ErrorCounter::AccumulateErrors(bool debug, CountTypes boosting_mode, const UnicityTable<FontInfo>& font_table, const ShapeTable& shape_table, const GenericVector<ShapeRating>& results, TrainingSample* sample) { int num_results = results.size(); int res_index = 0; bool debug_it = false; int font_id = sample->font_id(); int unichar_id = sample->class_id(); sample->set_is_error(false); if (num_results == 0) { // Reject. We count rejects as a separate category, but still mark the // sample as an error in case any training module wants to use that to // improve the classifier. sample->set_is_error(true); ++font_counts_[font_id].n[CT_REJECT]; } else if (shape_table.GetShape(results[0].shape_id). ContainsUnicharAndFont(unichar_id, font_id)) { ++font_counts_[font_id].n[CT_SHAPE_TOP_CORRECT]; // Unichar and font OK, but count if multiple unichars. if (shape_table.GetShape(results[0].shape_id).size() > 1) ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR]; } else { // This is a top shape error. ++font_counts_[font_id].n[CT_SHAPE_TOP_ERR]; // Check to see if any font in the top choice has attributes that match. bool attributes_match = false; uinT32 font_props = font_table.get(font_id).properties; const Shape& shape = shape_table.GetShape(results[0].shape_id); for (int c = 0; c < shape.size() && !attributes_match; ++c) { for (int f = 0; f < shape[c].font_ids.size(); ++f) { if (font_table.get(shape[c].font_ids[f]).properties == font_props) { attributes_match = true; break; } } } // TODO(rays) It is easy to add counters for individual font attributes // here if we want them. if (!attributes_match) ++font_counts_[font_id].n[CT_FONT_ATTR_ERR]; if (boosting_mode == CT_SHAPE_TOP_ERR) sample->set_is_error(true); // Find rank of correct unichar answer. (Ignoring the font.) while (res_index < num_results && !shape_table.GetShape(results[res_index].shape_id). ContainsUnichar(unichar_id)) { ++res_index; } if (res_index == 0) { // Unichar OK, but count if multiple unichars. if (shape_table.GetShape(results[res_index].shape_id).size() > 1) { ++font_counts_[font_id].n[CT_OK_MULTI_UNICHAR]; } } else { // Count maps from unichar id to shape id. if (num_results > 0) ++unichar_counts_(unichar_id, results[0].shape_id); // This is a unichar error. ++font_counts_[font_id].n[CT_UNICHAR_TOP1_ERR]; if (boosting_mode == CT_UNICHAR_TOP1_ERR) sample->set_is_error(true); if (res_index >= MIN(2, num_results)) { // It is also a 2nd choice unichar error. ++font_counts_[font_id].n[CT_UNICHAR_TOP2_ERR]; if (boosting_mode == CT_UNICHAR_TOP2_ERR) sample->set_is_error(true); } if (res_index >= num_results) { // It is also a top-n choice unichar error. ++font_counts_[font_id].n[CT_UNICHAR_TOPN_ERR]; if (boosting_mode == CT_UNICHAR_TOPN_ERR) sample->set_is_error(true); debug_it = debug; } } } // Compute mean number of return values and mean rank of correct answer. font_counts_[font_id].n[CT_NUM_RESULTS] += num_results; font_counts_[font_id].n[CT_RANK] += res_index; // If it was an error for boosting then sum the weight. if (sample->is_error()) { scaled_error_ += sample->weight(); } if (debug_it) { tprintf("%d results for char %s font %d :", num_results, shape_table.unicharset().id_to_unichar(unichar_id), font_id); for (int i = 0; i < num_results; ++i) { tprintf(" %.3f/%.3f:%s", results[i].rating, results[i].font, shape_table.DebugStr(results[i].shape_id).string()); } tprintf("\n"); return true; } return false; }