// Builds an upper-cased copy of the null-terminated string str32.
// Returns a buffer allocated with new[] (presumably released by the caller
// with delete[]), or NULL when char_set is NULL, when the string contains
// INVALID_UNICHAR_ID, or when the other-case form of a lower-case character
// is missing or is not exactly one character long.
char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
  if (char_set == NULL) {
    return NULL;
  }
  UNICHARSET *unicharset = char_set->InternalUnicharset();
  const int length = StrLen(str32);
  char_32 *result = new char_32[length + 1];
  if (result == NULL)
    return NULL;

  for (int pos = 0; pos < length; ++pos) {
    const char_32 cur = str32[pos];
    if (cur == INVALID_UNICHAR_ID) {
      delete[] result;
      return NULL;
    }
    // Anything that is not lower-case is copied through untouched.
    if (!unicharset->get_islower(char_set->ClassID(cur))) {
      result[pos] = cur;
      continue;
    }
    // Lower-case: look up the other-case form and its string.
    UNICHAR_ID upper_id = unicharset->get_other_case(char_set->ClassID(cur));
    const char_32 *upper_str = char_set->ClassString(upper_id);
    // The upper-case form must be exactly one character.
    if (upper_str == NULL || StrLen(upper_str) != 1) {
      delete[] result;
      return NULL;
    }
    result[pos] = upper_str[0];
  }
  result[length] = 0;
  return result;
}
// Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the // script_dir directory, then the tops and bottoms are expanded using the // script unicharset. // If non-empty, xheight data for the fonts are written to the xheights_file. void SetPropertiesForInputFile(const std::string& script_dir, const std::string& input_unicharset_file, const std::string& output_unicharset_file, const std::string& output_xheights_file) { UNICHARSET unicharset; // Load the input unicharset unicharset.load_from_file(input_unicharset_file.c_str()); tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), input_unicharset_file.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); SetupBasicProperties(true, false, &unicharset); tprintf("Setting script properties\n"); SetScriptProperties(script_dir, &unicharset); if (!output_xheights_file.empty()) { std::string xheights_str = GetXheightString(script_dir, unicharset); File::WriteStringToFileOrDie(xheights_str, output_xheights_file); } // Write the output unicharset tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); unicharset.save_to_file(output_unicharset_file.c_str()); }
// Sets all the properties for this unicharset given a src unicharset with // everything set. The unicharsets don't have to be the same, and graphemes // are correctly accounted for. void UNICHARSET::PartialSetPropertiesFromOther(int start_index, const UNICHARSET& src) { for (int ch = start_index; ch < size_used; ++ch) { const char* utf8 = id_to_unichar(ch); UNICHAR_PROPERTIES properties; if (src.GetStrProperties(utf8, &properties)) { // Setup the script_id, other_case, and mirror properly. const char* script = src.get_script_from_script_id(properties.script_id); properties.script_id = add_script(script); const char* other_case = src.id_to_unichar(properties.other_case); if (contains_unichar(other_case)) { properties.other_case = unichar_to_id(other_case); } else { properties.other_case = ch; } const char* mirror_str = src.id_to_unichar(properties.mirror); if (contains_unichar(mirror_str)) { properties.mirror = unichar_to_id(mirror_str); } else { properties.mirror = ch; } unichars[ch].properties.CopyFrom(properties); set_normed_ids(ch); } else { tprintf("Failed to get properties for index %d = %s\n", ch, utf8); } } }
int main(int argc, char** argv) { int option; const char* output_directory = "."; STRING unicharset_file_name; // Special characters are now included by default. UNICHARSET unicharset; setlocale(LC_ALL, ""); // Print usage if (argc <= 1) { printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]); exit(1); } // Parse arguments while ((option = tessopt(argc, argv, "D" )) != EOF) { switch (option) { case 'D': output_directory = tessoptarg; ++tessoptind; break; } } // Save file name unicharset_file_name = output_directory; unicharset_file_name += "/"; unicharset_file_name += kUnicharsetFileName; // Load box files for (; tessoptind < argc; ++tessoptind) { printf("Extracting unicharset from %s\n", argv[tessoptind]); FILE* box_file = fopen(argv[tessoptind], "rb"); if (box_file == NULL) { printf("Cannot open box file %s\n", argv[tessoptind]); return -1; } TBOX box; STRING unichar_string; int line_number = 0; while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) { unicharset.unichar_insert(unichar_string.string()); set_properties(&unicharset, unichar_string.string()); } } // Write unicharset file if (unicharset.save_to_file(unicharset_file_name.string())) { printf("Wrote unicharset file %s.\n", unicharset_file_name.string()); } else { printf("Cannot save unicharset file %s.\n", unicharset_file_name.string()); return -1; } return 0; }
/** * print_ratings_info * * Send all the ratings out to the logfile. * * @param fp file to use * @param ratings list of results * @param current_unicharset unicharset that can be used * for id-to-unichar conversion */ void print_ratings_info(FILE *fp, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset) { inT32 index; // to list inT32 best_index; // to list FLOAT32 best_rat; // rating FLOAT32 best_cert; // certainty const char* first_char = NULL; // character FLOAT32 first_rat; // rating FLOAT32 first_cert; // certainty const char* sec_char = NULL; // character FLOAT32 sec_rat = 0.0f; // rating FLOAT32 sec_cert = 0.0f; // certainty BLOB_CHOICE_IT c_it = ratings; // iterator index = ratings->length(); if (index > 0) { first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id()); first_rat = c_it.data()->rating(); first_cert = -c_it.data()->certainty(); if (index > 1) { sec_char = current_unicharset.id_to_unichar( c_it.data_relative(1)->unichar_id()); sec_rat = c_it.data_relative(1)->rating(); sec_cert = -c_it.data_relative(1)->certainty(); } else { sec_char = NULL; sec_rat = -1; sec_cert = -1; } } else { first_char = NULL; first_rat = -1; first_cert = -1; } best_index = -1; best_rat = -1; best_cert = -1; for (index = 0, c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward(), index++) { if (strcmp(current_unicharset.id_to_unichar(c_it.data()->unichar_id()), blob_answer) == 0) { best_index = index; best_rat = c_it.data()->rating(); best_cert = -c_it.data()->certainty(); } } if (first_char != NULL && (*first_char == '\0' || *first_char == ' ')) first_char = NULL; if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' ')) sec_char = NULL; fprintf(matcher_fp, " " INT32FORMAT " " INT32FORMAT " %g %g %s %g %g %s %g %g\n", ratings->length(), best_index, best_rat, best_cert, first_char != NULL ? first_char : "~", first_rat, first_cert, sec_char != NULL ? sec_char : "~", sec_rat, sec_cert); }
// Constructor is private. Only anticipated use of ErrorCounter is via // the static ComputeErrorRate. ErrorCounter::ErrorCounter(const UNICHARSET& unicharset, int fontsize) : scaled_error_(0.0), rating_epsilon_(kRatingEpsilon), unichar_counts_(unicharset.size(), unicharset.size(), 0), ok_score_hist_(0, 101), bad_score_hist_(0, 101), unicharset_(unicharset) { Counts empty_counts; font_counts_.init_to_size(fontsize, empty_counts); multi_unichar_counts_.init_to_size(unicharset.size(), 0); }
// Helper adds all the scripts from sid_set converted to ids from osd_set to // allowed_ids. static void AddAllScriptsConverted(const UNICHARSET& sid_set, const UNICHARSET& osd_set, GenericVector<int>* allowed_ids) { for (int i = 0; i < sid_set.get_script_table_size(); ++i) { if (i != sid_set.null_sid()) { const char* script = sid_set.get_script_from_script_id(i); allowed_ids->push_back(osd_set.get_script_id_from_name(script)); } } }
bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { if (word.length() < kMinAbsoluteGarbageWordLength) return false; int num_alphanum = 0; for (int x = 0; x < word.length(); ++x) { num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); } return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac); }
// Helper gets the combined x-heights string. std::string GetXheightString(const std::string& script_dir, const UNICHARSET& unicharset) { std::string xheights_str; for (int s = 0; s < unicharset.get_script_table_size(); ++s) { // Load the xheights for the script if available. std::string filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".xheights"; std::string script_heights; if (File::ReadFileToString(filename, &script_heights)) xheights_str += script_heights; } return xheights_str; }
// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) const { tprintf("Ratings Matrix (top 3 choices)\n"); int dim = dimension(); int band_width = bandwidth(); int row, col; for (col = 0; col < dim; ++col) { for (row = col; row < dim && row < col + band_width; ++row) { BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating == NOT_CLASSIFIED) continue; BLOB_CHOICE_IT b_it(rating); tprintf("col=%d row=%d ", col, row); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s rat=%g cert=%g " , unicharset.id_to_unichar(b_it.data()->unichar_id()), b_it.data()->rating(), b_it.data()->certainty()); } tprintf("\n"); } tprintf("\n"); } tprintf("\n"); for (col = 0; col < dim; ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < dim; ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); if (row >= col + band_width) { tprintf(" \t"); continue; } BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Builds a WERD_CHOICE from the given string by walking it one unichar at
 * a time with unicharset.step() and recording each step length.
 * If the walk does not consume the whole string with a non-zero final step,
 * the word is initialized with room for 8 unichars and marked bad.
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset) {
  STRING src_lengths;
  int len = strlen(src_string);
  const char *end = src_string + len;
  const char *cursor = src_string;
  int step = unicharset.step(cursor);

  // Advance while there is input left and the last step was valid.
  while (cursor < end && step > 0) {
    step = unicharset.step(cursor);
    src_lengths += step;
    cursor += step;
  }

  const bool consumed_all = (step != 0 && cursor == end);
  if (!consumed_all) {
    // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
    return;
  }
  this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM, unicharset);
}
/** * WERD_CHOICE::init * * Helper function to build a WERD_CHOICE from the given string, * fragment lengths, rating, certainty and permuter. * * The function assumes that src_string is not NULL. * src_lengths argument could be NULL, in which case the unichars * in src_string are assumed to all be of length 1. */ void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter, const UNICHARSET &unicharset) { int src_string_len = strlen(src_string); if (src_string_len == 0) { this->init(8); } else { this->init(src_lengths ? strlen(src_lengths): src_string_len); length_ = reserved_; int offset = 0; for (int i = 0; i < length_; ++i) { int unichar_length = src_lengths ? src_lengths[i] : 1; unichar_ids_[i] = unicharset.unichar_to_id(src_string+offset, unichar_length); fragment_lengths_[i] = 1; offset += unichar_length; } } rating_ = src_rating; certainty_ = src_certainty; permuter_ = src_permuter; }
// For each id in src, if it does not occur in this, add it, as in // SetPropertiesFromOther, otherwise expand the ranges, as in // ExpandRangesFromOther. void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) { int initial_used = size_used; for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) { // Only use fully valid entries. tprintf("Bad properties for index %d, char %s: " "%d,%d %d,%d %d,%d %d,%d %d,%d\n", ch, utf8, src_props.min_bottom, src_props.max_bottom, src_props.min_top, src_props.max_top, src_props.min_width, src_props.max_width, src_props.min_bearing, src_props.max_bearing, src_props.min_advance, src_props.max_advance); continue; } int id = size_used; if (contains_unichar(utf8)) { id = unichar_to_id(utf8); // Just expand current ranges. unichars[id].properties.ExpandRangesFrom(src_props); } else { unichar_insert(utf8); unichars[id].properties.SetRangesEmpty(); } } // Set properties, including mirror and other_case, WITHOUT reordering // the unicharset. PartialSetPropertiesFromOther(initial_used, src); }
// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) { tprintf("Ratings Matrix (top choices)\n"); int row, col; for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < this->dimension(); ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
/********************************************************************** * print_ratings_list * * Send all the ratings out to the logfile. **********************************************************************/ void print_ratings_list( const char *msg, // intro message BLOB_CHOICE_LIST *ratings, // list of results const UNICHARSET ¤t_unicharset // unicharset that can be used // for id-to-unichar conversion ) { if (ratings->length() == 0) { tprintf("%s:<none>\n", msg); return; } if (*msg != '\0') { tprintf("%s\n", msg); } BLOB_CHOICE_IT c_it; c_it.set_to_list(ratings); for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) { tprintf("r%.2f c%.2f : %d %s", c_it.data()->rating(), c_it.data()->certainty(), c_it.data()->unichar_id(), current_unicharset.debug_str(c_it.data()->unichar_id()).string()); if (!c_it.at_last()) { tprintf("\n"); } } tprintf("\n"); fflush(stdout); }
// Converts the code point wc to its UTF-8 form via UNICHAR and returns the
// id that unicharset assigns to that string. The temporary UTF-8 buffer
// returned by utf8_str() is released with delete[] before returning.
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
  UNICHAR code_point(wc);
  char *utf8 = code_point.utf8_str();
  const UNICHAR_ID id = unicharset.unichar_to_id(utf8);
  delete[] utf8;
  return id;
}
int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const { if (filename == nullptr) return 0; FILE *word_file; char string [CHARS_PER_LINE]; int misses = 0; UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); word_file = fopen(filename, "r"); if (word_file == nullptr) { tprintf("Error: Could not open file %s\n", filename); ASSERT_HOST(word_file); } while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) { chomp_string(string); // remove newline WERD_CHOICE word(string, unicharset); if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { if (!match_words(&word, 0, 0, enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { tprintf("Missing word: %s\n", string); ++misses; } } else { tprintf("Failed to create a valid word from %s\n", string); } } fclose (word_file); // Make sure the user sees this with fprintf instead of tprintf. if (debug_level_) tprintf("Number of lost words=%d\n", misses); return misses; }
int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { int state = 0; int x; for (x = 0; x < word.length(); ++x) { UNICHAR_ID ch_id = word.unichar_id(x); if (unicharset.get_isupper(ch_id)) state = case_state_table[state][1]; else if (unicharset.get_islower(ch_id)) state = case_state_table[state][2]; else if (unicharset.get_isdigit(ch_id)) state = case_state_table[state][3]; else state = case_state_table[state][0]; if (state == -1) return false; } return state != 5; // single lower is bad }
// Expands the tops and bottoms and widths for this unicharset given a // src unicharset with ranges in it. The unicharsets don't have to be the // same, and graphemes are correctly accounted for. void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) { for (int ch = 0; ch < size_used; ++ch) { const char* utf8 = id_to_unichar(ch); UNICHAR_PROPERTIES properties; if (src.GetStrProperties(utf8, &properties)) { // Expand just the ranges from properties. unichars[ch].properties.ExpandRangesFrom(properties); } } }
/** * print_ratings_info * * Send all the ratings out to the logfile. * * @param fp file to use * @param ratings list of results * @param current_unicharset unicharset that can be used * for id-to-unichar conversion */ void print_ratings_info(FILE *fp, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset) { inT32 index; // to list const char* first_char = NULL; // character FLOAT32 first_rat; // rating FLOAT32 first_cert; // certainty const char* sec_char = NULL; // character FLOAT32 sec_rat = 0.0f; // rating FLOAT32 sec_cert = 0.0f; // certainty BLOB_CHOICE_IT c_it = ratings; // iterator index = ratings->length(); if (index > 0) { first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id()); first_rat = c_it.data()->rating(); first_cert = -c_it.data()->certainty(); if (index > 1) { sec_char = current_unicharset.id_to_unichar( c_it.data_relative(1)->unichar_id()); sec_rat = c_it.data_relative(1)->rating(); sec_cert = -c_it.data_relative(1)->certainty(); } else { sec_char = NULL; sec_rat = -1; sec_cert = -1; } } else { first_char = NULL; first_rat = -1; first_cert = -1; } if (first_char != NULL && (*first_char == '\0' || *first_char == ' ')) first_char = NULL; if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' ')) sec_char = NULL; tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n", ratings->length(), first_char != NULL ? first_char : "~", first_rat, first_cert, sec_char != NULL ? sec_char : "~", sec_rat, sec_cert); }
// Makes this a copy of src. Clears this completely first, so the automatic // ids will not be present in this if not in src. Does NOT reorder the set! void UNICHARSET::CopyFrom(const UNICHARSET& src) { clear(); for (int ch = 0; ch < src.size_used; ++ch) { const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties; const char* utf8 = src.id_to_unichar(ch); unichar_insert(utf8); unichars[ch].properties.ExpandRangesFrom(src_props); } // Set properties, including mirror and other_case, WITHOUT reordering // the unicharset. PartialSetPropertiesFromOther(0, src); }
int main(int argc, char** argv) { // Sets properties on the input unicharset file, and writes: // rootdir/lang/lang.charset_size=ddd.txt // rootdir/lang/lang.traineddata // rootdir/lang/lang.unicharset // If the 3 word lists are provided, the dawgs are also added // to the traineddata file. // The output unicharset and charset_size files are just for // human readability. tesseract::CheckSharedLibraryVersion(); tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); GenericVector<STRING> words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words); tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs); tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers); // Load the input unicharset UNICHARSET unicharset; if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) { tprintf("Failed to load unicharset from %s\n", FLAGS_input_unicharset.c_str()); return 1; } tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), FLAGS_input_unicharset.c_str()); // Set unichar properties tprintf("Setting unichar properties\n"); tesseract::SetupBasicProperties(/*report_errors*/ true, /*decompose (NFD)*/ false, &unicharset); tprintf("Setting script properties\n"); tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset); // Combine everything into a traineddata file. return tesseract::CombineLangModel( unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr, /*writer*/ nullptr); }
/** * string_and_lengths * * Populates the given word_str with unichars from unichar_ids and * and word_lengths_str with the corresponding unichar lengths. * Uses current_unicharset to make unichar id -> unichar conversions. */ void WERD_CHOICE::string_and_lengths(const UNICHARSET ¤t_unicharset, STRING *word_str, STRING *word_lengths_str) const { *word_str = ""; if (word_lengths_str != NULL) *word_lengths_str = ""; for (int i = 0; i < length_; ++i) { const char *ch = current_unicharset.id_to_unichar(unichar_ids_[i]); *word_str += ch; if (word_lengths_str != NULL) { *word_lengths_str += strlen(ch); } } }
bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector<STRING> &truth_text) { if (choice == NULL) return false; int i; STRING truth_str; for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; STRING normed_choice_str; for (i = 0; i < choice->length(); ++i) { normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); } return (truth_str == normed_choice_str); }
/** * print_char_choices_list */ void print_char_choices_list(const char *msg, const BLOB_CHOICE_LIST_VECTOR &char_choices, const UNICHARSET ¤t_unicharset, BOOL8 detailed) { if (*msg != '\0') tprintf("%s\n", msg); for (int x = 0; x < char_choices.length(); ++x) { BLOB_CHOICE_IT c_it; c_it.set_to_list(char_choices.get(x)); tprintf("char[%d]: %s\n", x, current_unicharset.debug_str( c_it.data()->unichar_id()).string()); if (detailed) print_ratings_list(" ", char_choices.get(x), current_unicharset); } }
/**
 * WERD_CHOICE::WERD_CHOICE
 *
 * Builds a WERD_CHOICE from the given string, using
 * unicharset.encode_string to obtain the per-unichar lengths. If encoding
 * fails, the word is initialized with room for 8 unichars and marked bad.
 * The function assumes that src_string is not NULL.
 */
WERD_CHOICE::WERD_CHOICE(const char *src_string,
                         const UNICHARSET &unicharset)
    : unicharset_(&unicharset) {
  GenericVector<UNICHAR_ID> encoding;
  GenericVector<char> lengths;
  const bool encoded =
      unicharset.encode_string(src_string, true, &encoding, &lengths, NULL);
  if (!encoded) {
    // There must have been an invalid unichar in the string.
    this->init(8);
    this->make_bad();
    return;
  }
  // Terminate the lengths so they can be handed over as a C string.
  lengths.push_back('\0');
  STRING src_lengths = &lengths[0];
  this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
}
// Classifies the vertical position of a blob from its box and the top/bottom
// ranges unicharset holds for unichar_id. Checked in this order:
//   SP_DROPCAP      bottom <= kMaxDropCapBottom
//   SP_SUBSCRIPT    top below (min_top - kMinSubscriptOffset) and bottom
//                   below (kBlnBaselineOffset - kMinSubscriptOffset)
//   SP_SUPERSCRIPT  bottom above (max_bottom + kMinSuperscriptOffset)
//   SP_NORMAL       otherwise
// When print_debug is set, the decision and all thresholds are printed.
/* static */
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
                                        const UNICHARSET& unicharset,
                                        const TBOX& blob_box,
                                        UNICHAR_ID unichar_id) {
  const int top = blob_box.top();
  const int bottom = blob_box.bottom();

  int min_bottom, max_bottom, min_top, max_top;
  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top,
                            &max_top);

  const int sub_thresh_top = min_top - kMinSubscriptOffset;
  const int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
  const int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;

  const bool is_dropcap = bottom <= kMaxDropCapBottom;
  const bool is_subscript = top < sub_thresh_top && bottom < sub_thresh_bot;
  const bool is_superscript = bottom > sup_thresh_bot;

  ScriptPos position = tesseract::SP_NORMAL;
  if (is_dropcap) {
    position = tesseract::SP_DROPCAP;
  } else if (is_subscript) {
    position = tesseract::SP_SUBSCRIPT;
  } else if (is_superscript) {
    position = tesseract::SP_SUPERSCRIPT;
  }

  if (print_debug) {
    const char *pos_name = ScriptPosToString(position);
    tprintf("%s Character %s[bot:%d top: %d] "
            "bot_range[%d,%d] top_range[%d, %d] "
            "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
            pos_name, unicharset.id_to_unichar(unichar_id),
            bottom, top,
            min_bottom, max_bottom, min_top, max_top,
            sub_thresh_bot, sub_thresh_top,
            sup_thresh_bot);
  }
  return position;
}
int main(int argc, char *argv[]) { if (argc != 4) { tprintf("Print all the words in a given dawg.\n"); tprintf("Usage: %s <unicharset> <dawgfile> <wordlistfile>\n", argv[0]); return 1; } const char *unicharset_file = argv[1]; const char *dawg_file = argv[2]; const char *wordlist_file = argv[3]; UNICHARSET unicharset; if (!unicharset.load_from_file(unicharset_file)) { tprintf("Error loading unicharset from %s.\n", unicharset_file); return 1; } tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file); if (dict == NULL) { tprintf("Error loading dictionary from %s.\n", dawg_file); return 1; } int retval = WriteDawgAsWordlist(unicharset, dict, wordlist_file); delete dict; return retval; }
// Helper prints the given set of blob choices. static void PrintPath(int length, const BLOB_CHOICE** blob_choices, const UNICHARSET& unicharset, const char *label, FILE *output_file) { float rating = 0.0f; float certainty = 0.0f; for (int i = 0; i < length; ++i) { const BLOB_CHOICE* blob_choice = blob_choices[i]; fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id())); rating += blob_choice->rating(); if (certainty > blob_choice->certainty()) certainty = blob_choice->certainty(); } fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty); }
bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { bool all_one_case = true; bool capitalized; bool prev_upper; bool prev_lower; bool first_upper; bool first_lower; bool cur_upper; bool cur_lower; string str8; if (!char_set) { // If cube char_set is missing, use C-locale-dependent functions // on UTF8 characters to determine case properties. first_upper = isupper(str32[0]); first_lower = islower(str32[0]); if (first_upper) capitalized = true; prev_upper = first_upper; prev_lower = islower(str32[0]); for (int c = 1; str32[c] != 0; ++c) { cur_upper = isupper(str32[c]); cur_lower = islower(str32[c]); if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) all_one_case = false; if (cur_upper) capitalized = false; prev_upper = cur_upper; prev_lower = cur_lower; } } else { UNICHARSET *unicharset = char_set->InternalUnicharset(); // Use UNICHARSET functions to determine case properties first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); if (first_upper) capitalized = true; prev_upper = first_upper; prev_lower = unicharset->get_islower(char_set->ClassID(str32[0])); for (int c = 1; c < StrLen(str32); ++c) { cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) all_one_case = false; if (cur_upper) capitalized = false; prev_upper = cur_upper; prev_lower = cur_lower; } } return all_one_case || capitalized; }