// Sets all the properties for this unicharset given a src unicharset with // everything set. The unicharsets don't have to be the same, and graphemes // are correctly accounted for. void UNICHARSET::PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src) { for (int ch = start_index; ch < size_used; ++ch) { const char* utf8 = id_to_unichar(ch); UNICHAR_PROPERTIES properties; if (src.GetStrProperties(utf8, &properties)) { // Setup the script_id, other_case, and mirror properly. const char* script = src.get_script_from_script_id(properties.script_id); properties.script_id = add_script(script); const char* other_case = src.id_to_unichar(properties.other_case); if (contains_unichar(other_case)) { properties.other_case = unichar_to_id(other_case); } else { properties.other_case = ch; } const char* mirror_str = src.id_to_unichar(properties.mirror); if (contains_unichar(mirror_str)) { properties.mirror = unichar_to_id(mirror_str); } else { properties.mirror = ch; } unichars[ch].properties.CopyFrom(properties); set_normed_ids(ch); } else { tprintf("Failed to get properties for index %d = %s\n", ch, utf8); } } }
/** * print_ratings_info * * Send all the ratings out to the logfile. * * @param fp file to use * @param ratings list of results * @param current_unicharset unicharset that can be used * for id-to-unichar conversion */ void print_ratings_info(FILE *fp, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset) { inT32 index; // to list inT32 best_index; // to list FLOAT32 best_rat; // rating FLOAT32 best_cert; // certainty const char* first_char = NULL; // character FLOAT32 first_rat; // rating FLOAT32 first_cert; // certainty const char* sec_char = NULL; // character FLOAT32 sec_rat = 0.0f; // rating FLOAT32 sec_cert = 0.0f; // certainty BLOB_CHOICE_IT c_it = ratings; // iterator index = ratings->length(); if (index > 0) { first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id()); first_rat = c_it.data()->rating(); first_cert = -c_it.data()->certainty(); if (index > 1) { sec_char = current_unicharset.id_to_unichar( c_it.data_relative(1)->unichar_id()); sec_rat = c_it.data_relative(1)->rating(); sec_cert = -c_it.data_relative(1)->certainty(); } else { sec_char = NULL; sec_rat = -1; sec_cert = -1; } } else { first_char = NULL; first_rat = -1; first_cert = -1; } best_index = -1; best_rat = -1; best_cert = -1; for (index = 0, c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward(), index++) { if (strcmp(current_unicharset.id_to_unichar(c_it.data()->unichar_id()), blob_answer) == 0) { best_index = index; best_rat = c_it.data()->rating(); best_cert = -c_it.data()->certainty(); } } if (first_char != NULL && (*first_char == '\0' || *first_char == ' ')) first_char = NULL; if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' ')) sec_char = NULL; fprintf(matcher_fp, " " INT32FORMAT " " INT32FORMAT " %g %g %s %g %g %s %g %g\n", ratings->length(), best_index, best_rat, best_cert, first_char != NULL ? first_char : "~", first_rat, first_cert, sec_char != NULL ? sec_char : "~", sec_rat, sec_cert); }
// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) const { tprintf("Ratings Matrix (top 3 choices)\n"); int dim = dimension(); int band_width = bandwidth(); int row, col; for (col = 0; col < dim; ++col) { for (row = col; row < dim && row < col + band_width; ++row) { BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating == NOT_CLASSIFIED) continue; BLOB_CHOICE_IT b_it(rating); tprintf("col=%d row=%d ", col, row); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s rat=%g cert=%g " , unicharset.id_to_unichar(b_it.data()->unichar_id()), b_it.data()->rating(), b_it.data()->certainty()); } tprintf("\n"); } tprintf("\n"); } tprintf("\n"); for (col = 0; col < dim; ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < dim; ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); if (row >= col + band_width) { tprintf(" \t"); continue; } BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) { tprintf("Ratings Matrix (top choices)\n"); int row, col; for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < this->dimension(); ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
// For each id in src, if it does not occur in this, add it, as in // SetPropertiesFromOther, otherwise expand the ranges, as in // ExpandRangesFromOther. void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { int initial_used = size_used; for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) { // Only use fully valid entries. tprintf("Bad properties for index %d, char %s: " "%d,%d %d,%d %d,%d %d,%d %d,%d\n", ch, utf8, src_props.min_bottom, src_props.max_bottom, src_props.min_top, src_props.max_top, src_props.min_width, src_props.max_width, src_props.min_bearing, src_props.max_bearing, src_props.min_advance, src_props.max_advance); continue; } int id = size_used; if (contains_unichar(utf8)) { id = unichar_to_id(utf8); // Just expand current ranges. unichars[id].properties.ExpandRangesFrom(src_props); } else { unichar_insert(utf8); unichars[id].properties.SetRangesEmpty(); } } // Set properties, including mirror and other_case, WITHOUT reordering // the unicharset. PartialSetPropertiesFromOther(initial_used, src); }
/** * print_ratings_info * * Send all the ratings out to the logfile. * * @param fp file to use * @param ratings list of results * @param current_unicharset unicharset that can be used * for id-to-unichar conversion */ void print_ratings_info(FILE *fp, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset) { inT32 index; // to list const char* first_char = NULL; // character FLOAT32 first_rat; // rating FLOAT32 first_cert; // certainty const char* sec_char = NULL; // character FLOAT32 sec_rat = 0.0f; // rating FLOAT32 sec_cert = 0.0f; // certainty BLOB_CHOICE_IT c_it = ratings; // iterator index = ratings->length(); if (index > 0) { first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id()); first_rat = c_it.data()->rating(); first_cert = -c_it.data()->certainty(); if (index > 1) { sec_char = current_unicharset.id_to_unichar( c_it.data_relative(1)->unichar_id()); sec_rat = c_it.data_relative(1)->rating(); sec_cert = -c_it.data_relative(1)->certainty(); } else { sec_char = NULL; sec_rat = -1; sec_cert = -1; } } else { first_char = NULL; first_rat = -1; first_cert = -1; } if (first_char != NULL && (*first_char == '\0' || *first_char == ' ')) first_char = NULL; if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' ')) sec_char = NULL; tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n", ratings->length(), first_char != NULL ? first_char : "~", first_rat, first_cert, sec_char != NULL ? sec_char : "~", sec_rat, sec_cert); }
// Makes this a copy of src. Clears this completely first, so the automatic // ids will not be present in this if not in src. Does NOT reorder the set! void UNICHARSET::CopyFrom(const UNICHARSET& src) { clear(); for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); unichar_insert(utf8); unichars[ch].properties.ExpandRangesFrom(src_props); } // Set properties, including mirror and other_case, WITHOUT reordering // the unicharset. PartialSetPropertiesFromOther(0, src); }
/** * string_and_lengths * * Populates the given word_str with unichars from unichar_ids and * and word_lengths_str with the corresponding unichar lengths. * Uses current_unicharset to make unichar id -> unichar conversions. */ void WERD_CHOICE::string_and_lengths(const UNICHARSET ¤t_unicharset, STRING *word_str, STRING *word_lengths_str) const { *word_str = ""; if (word_lengths_str != NULL) *word_lengths_str = ""; for (int i = 0; i < length_; ++i) { const char *ch = current_unicharset.id_to_unichar(unichar_ids_[i]); *word_str += ch; if (word_lengths_str != NULL) { *word_lengths_str += strlen(ch); } } }
// Helper prints the given set of blob choices. static void PrintPath(int length, const BLOB_CHOICE** blob_choices, const UNICHARSET& unicharset, const char *label, FILE *output_file) { float rating = 0.0f; float certainty = 0.0f; for (int i = 0; i < length; ++i) { const BLOB_CHOICE* blob_choice = blob_choices[i]; fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id())); rating += blob_choice->rating(); if (certainty > blob_choice->certainty()) certainty = blob_choice->certainty(); } fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty); }
// For each id in src, if it does not occur in this, add it, as in // SetPropertiesFromOther, otherwise expand the ranges, as in // ExpandRangesFromOther. void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { int initial_used = size_used; for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); int id = size_used; if (contains_unichar(utf8)) { id = unichar_to_id(utf8); // Just expand current ranges. unichars[id].properties.ExpandRangesFrom(src_props); } else { unichar_insert_backwards_compatible(utf8); unichars[id].properties.SetRangesEmpty(); } } // Set properties, including mirror and other_case, WITHOUT reordering // the unicharset. PartialSetPropertiesFromOther(initial_used, src); }
// Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the // script_dir directory, then the tops and bottoms are expanded using the // script unicharset. // If non-empty, xheight data for the fonts are written to the xheights_file. void SetPropertiesForInputFile(const string& script_dir, const string& input_unicharset_file, const string& output_unicharset_file, const string& output_xheights_file) { UNICHARSET unicharset; // Load the input unicharset unicharset.load_from_file(input_unicharset_file.c_str()); tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), input_unicharset_file.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); SetupBasicProperties(true, false, &unicharset); string xheights_str; for (int s = 0; s < unicharset.get_script_table_size(); ++s) { // Load the unicharset for the script if available. string filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".unicharset"; UNICHARSET script_set; if (script_set.load_from_file(filename.c_str())) { unicharset.SetPropertiesFromOther(script_set); } // Load the xheights for the script if available. filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".xheights"; string script_heights; if (File::ReadFileToString(filename, &script_heights)) xheights_str += script_heights; } if (!output_xheights_file.empty()) File::WriteStringToFileOrDie(xheights_str, output_xheights_file); for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) { if (unicharset.PropertiesIncomplete(c)) { tprintf("Warning: properties incomplete for index %d = %s\n", c, unicharset.id_to_unichar(c)); } } // Write the output unicharset tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); unicharset.save_to_file(output_unicharset_file.c_str()); }
// Classifies the vertical position of a blob relative to the expected
// top/bottom ranges of unichar_id in unicharset. Checks are made in this
// order and the first match wins:
//   - drop cap:    bottom <= kMaxDropCapBottom
//   - subscript:   top below (min_top - kMinSubscriptOffset) and bottom
//                  below (kBlnBaselineOffset - kMinSubscriptOffset)
//   - superscript: bottom above (max_bottom + kMinSuperscriptOffset)
// Otherwise the position is SP_NORMAL. With print_debug set, the decision
// and all values involved are printed via tprintf.
/* static */
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
                                        const UNICHARSET& unicharset,
                                        const TBOX& blob_box,
                                        UNICHAR_ID unichar_id) {
  const int blob_top = blob_box.top();
  const int blob_bottom = blob_box.bottom();

  int min_bottom, max_bottom, min_top, max_top;
  unicharset.get_top_bottom(unichar_id,
                            &min_bottom, &max_bottom,
                            &min_top, &max_top);

  // Thresholds derived from the unichar's expected ranges.
  const int sub_thresh_top = min_top - kMinSubscriptOffset;
  const int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
  const int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;

  ScriptPos position = tesseract::SP_NORMAL;
  if (blob_bottom <= kMaxDropCapBottom) {
    position = tesseract::SP_DROPCAP;
  } else if (blob_top < sub_thresh_top && blob_bottom < sub_thresh_bot) {
    position = tesseract::SP_SUBSCRIPT;
  } else if (blob_bottom > sup_thresh_bot) {
    position = tesseract::SP_SUPERSCRIPT;
  }

  if (print_debug) {
    const char *position_name = ScriptPosToString(position);
    tprintf("%s Character %s[bot:%d top: %d] "
            "bot_range[%d,%d] top_range[%d, %d] "
            "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
            position_name, unicharset.id_to_unichar(unichar_id),
            blob_bottom, blob_top,
            min_bottom, max_bottom, min_top, max_top,
            sub_thresh_bot, sub_thresh_top,
            sup_thresh_bot);
  }
  return position;
}