/* Reads the stream line by line, splits each line into a name/value
 * pair at the first character from `separators`, and feeds every pair
 * to `read_property` until it returns something other than OK or input
 * runs out. An I/O error on the stream forces the result to ERR.
 * Always shuts the stream down via io_done() before returning. */
static int
io_load_file(struct io *io, const char *separators,
	     size_t *lineno, io_read_fn read_property, void *data)
{
	struct buffer line;
	int status = OK;

	while (status == OK && io_get_line(io, &line, '\n', lineno, TRUE)) {
		char *key = chomp_string(line.data);
		size_t keylen = strcspn(key, separators);
		char *val = "";
		size_t vallen = 0;

		if (key[keylen]) {
			/* Terminate the key in place; the trimmed remainder
			 * becomes the value. */
			key[keylen] = 0;
			val = chomp_string(key + keylen + 1);
			vallen = strlen(val);
		}

		status = read_property(key, keylen, val, vallen, data);
	}

	/* A stream error overrides an otherwise successful parse. */
	if (status != ERR && io_error(io))
		status = ERR;
	io_done(io);
	return status;
}
/* Parses a git-style ident line of the form "Name <email> epoch zone"
 * into an author entry and, when requested, a commit time.
 * `ident` is modified in place (the '<' and '>' are overwritten with
 * NULs to split the string). `*author` always receives a value; `time`
 * may be NULL when the caller only wants the author. */
void
parse_author_line(char *ident, const struct ident **author, struct time *time)
{
	char *nameend = strchr(ident, '<');
	char *emailend = strchr(ident, '>');
	const char *name, *email = "";

	/* Split the buffer so name and email become separate strings. */
	if (nameend && emailend)
		*nameend = *emailend = 0;
	name = chomp_string(ident);
	if (nameend)
		email = chomp_string(nameend + 1);
	/* Fall back to the other field when one is empty, and to the
	 * unknown-ident placeholders when both are. */
	if (!*name)
		name = *email ? email : unknown_ident.name;
	if (!*email)
		email = *name ? name : unknown_ident.email;

	*author = get_author(name, email);

	/* Parse epoch and timezone */
	if (time && emailend && emailend[1] == ' ') {
		char *secs = emailend + 2;
		char *zone = strchr(secs, ' ');

		parse_timesec(time, secs);

		/* Only accept a timezone of the exact " +0700" width. */
		if (zone && strlen(zone) == STRING_SIZE(" +0700"))
			parse_timezone(time, zone + 1);
	}
}
bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset, SetParamConstraint constraint, ParamsVectors *member_params) { char line[MAX_PATH]; // input line bool anyerr = false; // true if any error bool foundit; // found parameter inT16 length; // length of line char *valptr; // value field while ((end_offset < 0 || ftell(fp) < end_offset) && fgets(line, MAX_PATH, fp)) { if (line[0] != '\n' && line[0] != '#') { length = strlen (line); chomp_string(line); // remove newline for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t'; valptr++); if (*valptr) { // found blank *valptr = '\0'; // make name a string do valptr++; // find end of blanks while (*valptr == ' ' || *valptr == '\t'); } foundit = SetParam(line, valptr, constraint, member_params); if (!foundit) { anyerr = true; // had an error tprintf("read_params_file: parameter not found: %s\n", line); exit(1); } } } return anyerr; }
// Reads parameter assignments ("name value", one per line) from fp and
// applies each via SetParam() under the given constraint. Lines whose
// first character is '\r', '\n' or '#' are ignored.
// Returns true if at least one parameter name was not recognized.
bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
                                  ParamsVectors *member_params) {
  char line[MAX_PATH];     // current input line
  bool any_error = false;  // set when SetParam fails to match a name

  while (fp->FGets(line, MAX_PATH) != nullptr) {
    // Skip blank lines and comments.
    if (line[0] == '\r' || line[0] == '\n' || line[0] == '#') continue;
    chomp_string(line);  // strip the trailing newline
    // Scan past the parameter name to the first blank, if any.
    char *value = line;
    while (*value != '\0' && *value != ' ' && *value != '\t') ++value;
    if (*value != '\0') {
      // Terminate the name, then skip the run of blanks to the value.
      *value++ = '\0';
      while (*value == ' ' || *value == '\t') ++value;
    }
    if (!SetParam(line, value, constraint, member_params)) {
      any_error = true;  // had an error
      tprintf("Warning: Parameter not found: %s\n", line);
    }
  }
  return any_error;
}
// Checks that each word listed in the given file (one word per line) can
// be matched against this dawg. Words containing kWildcard characters are
// matched against any unichar id when enable_wildcard is true.
// Words that fail to match are reported and counted.
// Returns the number of words that were missing from the dawg; returns 0
// without doing anything when filename is nullptr. Aborts via ASSERT_HOST
// if the file cannot be opened.
int Dawg::check_for_words(const char *filename,
                          const UNICHARSET &unicharset,
                          bool enable_wildcard) const {
  if (filename == nullptr) return 0;

  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  int        misses = 0;
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);

  word_file = fopen(filename, "r");
  if (word_file == nullptr) {
    tprintf("Error: Could not open file %s\n", filename);
    ASSERT_HOST(word_file);  // aborts: word_file is known null here
  }

  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if (word.length() > 0 &&
        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!match_words(&word, 0, 0,
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
        tprintf("Missing word: %s\n", string);
        ++misses;
      }
    } else {
      // Empty line or a string the unicharset could not fully encode.
      tprintf("Failed to create a valid word from %s\n", string);
    }
  }
  fclose (word_file);
  // Summary is only emitted when debugging is enabled. NOTE(review): an
  // earlier comment here said this should use fprintf so the user always
  // sees it, but the code uses tprintf gated on debug_level_ — confirm
  // which behavior is intended.
  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
  return misses;
}
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
// The box file is assumed to contain box definitions, one per line, of the
// following format for blob-level boxes:
//   <UTF8 str> <left> <bottom> <right> <top> <page id>
// and for word/line-level boxes:
//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
// See applyybox.cpp for more information.
bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
                     STRING* utf8_str, TBOX* bounding_box) {
  *bounding_box = TBOX();       // Initialize it to empty.
  *utf8_str = "";
  char uch[kBoxReadBufSize];
  const char *buffptr = boxfile_str;
  // Read the unichar without messing up on Tibetan.
  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
  // as whitespace by sscanf, so it is more reliable to just find
  // ascii space and tab.
  int uch_len = 0;
  // Skip unicode file designation, if present.
  const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
  if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
    buffptr += 3;
  // Allow a single blank as the UTF-8 string. Check for empty string and
  // then blindly eat the first character.
  if (*buffptr == '\0') return false;
  do {
    uch[uch_len++] = *buffptr++;
  } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
           uch_len < kBoxReadBufSize - 1);
  uch[uch_len] = '\0';
  if (*buffptr != '\0') ++buffptr;  // step over the separator
  int x_min, y_min, x_max, y_max;
  *page_number = 0;
  // The page id is optional; count == 4 means it was absent and stays 0.
  int count = sscanf(buffptr, "%d %d %d %d %d", &x_min, &y_min, &x_max,
                     &y_max, page_number);
  if (count != 5 && count != 4) {
    tprintf("Bad box coordinates in boxfile string! %s\n", ubuf);
    return false;
  }
  // Test for long space-delimited string label.
  if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
      (buffptr = strchr(buffptr, '#')) != NULL) {
    // Replace the "WordStr" code with the text that follows '#'.
    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
    uch[kBoxReadBufSize - 1] = '\0';  // Prevent buffer overrun.
    chomp_string(uch);
    uch_len = strlen(uch);
  }
  // Validate UTF8 by making unichars with it.
  int used = 0;
  while (used < uch_len) {
    UNICHAR ch(uch + used, uch_len - used);
    int new_used = ch.utf8_len();
    if (new_used == 0) {
      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
              uch + used, uch[used], used + 1);
      return false;
    }
    used += new_used;
  }
  *utf8_str = uch;
  // Normalize the box so min <= max on both axes.
  if (x_min > x_max) Swap(&x_min, &x_max);
  if (y_min > y_max) Swap(&y_min, &y_max);
  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
  return true;  // Successfully read a box.
}
/* Splits the command string `cmd` in place into at most SIZEOF_ARG
 * whitespace-separated arguments, storing pointers into `cmd` in argv
 * starting at *argc. When remove_quotes is true, one level of quoting
 * reported by get_arg_valuelen() is stripped from each argument.
 * NULL-terminates argv when room remains and returns FALSE when the
 * argument list overflowed.
 * Fix: the second argument to get_arg_valuelen() had been corrupted to
 * the unterminated token `"ed` (an HTML-entity mangling of `&quoted`);
 * restored the address-of out-parameter, which the quote_offset
 * computation below depends on. */
static bool
split_argv_string(const char *argv[SIZEOF_ARG], int *argc, char *cmd, bool remove_quotes)
{
	while (*cmd && *argc < SIZEOF_ARG) {
		char quoted = 0;
		/* Length of the next argument; sets `quoted` when the
		 * argument is wrapped in quotes. */
		int valuelen = get_arg_valuelen(cmd, &quoted);
		bool advance = cmd[valuelen] != 0;
		/* Skip/strip one quote character on each side when asked. */
		int quote_offset = !!(quoted && remove_quotes);

		cmd[valuelen - quote_offset] = 0;
		argv[(*argc)++] = chomp_string(cmd + quote_offset);
		cmd = chomp_string(cmd + valuelen + advance);
	}

	if (*argc < SIZEOF_ARG)
		argv[*argc] = NULL;
	return *argc < SIZEOF_ARG;
}
/* Reads one '\n'-terminated chunk from `io`, trims it with
 * chomp_string(), and copies the result into `buf` (truncated to
 * bufsize). Returns TRUE only when data was read and the stream was
 * shut down cleanly by io_done(). */
bool
io_read_buf(struct io *io, char buf[], size_t bufsize)
{
	struct buffer result = {0};
	bool have_data = io_get(io, &result, '\n', TRUE);

	if (have_data) {
		char *trimmed = chomp_string(result.data);

		result.data = trimmed;
		string_ncopy_do(buf, bufsize, trimmed, strlen(trimmed));
	}

	/* result.data stays NULL when nothing was read. */
	return io_done(io) && result.data;
}
// Command-line tool: reads a word list and records, via
// Dict::NoDangerousAmbig(), the dangerous ambiguities found for each word
// into the file named by the output_ambig_words_file variable.
// Usage: prog [-v | --version | [-l lang] tessdata_dir wordlist_file
//             output_ambiguous_wordlist_file]
int main(int argc, char** argv) {
  tesseract::CheckSharedLibraryVersion();

  // Parse input arguments.
  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
    printf("%s\n", tesseract::TessBaseAPI::Version());
    return 0;
  } else if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
    printf("Usage: %s -v | --version | %s [-l lang] tessdata_dir wordlist_file"
           " output_ambiguous_wordlist_file\n", argv[0], argv[0]);
    return 1;
  }
  // With "-l lang" the positional arguments shift right by two.
  int argv_offset = 0;
  STRING lang;
  if (argc == 6) {
    lang = argv[2];
    argv_offset = 2;
  } else {
    lang = "eng";
  }
  const char *tessdata_dir = argv[++argv_offset];
  const char *input_file_str = argv[++argv_offset];
  const char *output_file_str = argv[++argv_offset];

  // Initialize Tesseract. The output file is passed in as the value of
  // the output_ambig_words_file variable.
  tesseract::TessBaseAPI api;
  GenericVector<STRING> vars_vec;
  GenericVector<STRING> vars_values;
  vars_vec.push_back("output_ambig_words_file");
  vars_values.push_back(output_file_str);
  api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
           nullptr, 0, &vars_vec, &vars_values, false);
  tesseract::Dict &dict = api.tesseract()->getDict();

  FILE *input_file = fopen(input_file_str, "rb");
  if (input_file == nullptr) {
    tprintf("Failed to open input wordlist file %s\n", input_file_str);
    exit(1);
  }
  char str[CHARS_PER_LINE];

  // Read word list and call Dict::NoDangerousAmbig() for each word
  // to record ambiguities in the output file.
  while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) {
    chomp_string(str);  // remove newline
    WERD_CHOICE word(str, dict.getUnicharset());
    dict.NoDangerousAmbig(&word, nullptr, false, nullptr);
  }
  // Clean up.
  fclose(input_file);
}
// Parses the given box file string into a page_number, utf8_str, and // bounding_box. Returns true on a successful parse. // The box file is assumed to contain box definitions, one per line, of the // following format for blob-level boxes: // <UTF8 str> <left> <bottom> <right> <top> <page id> // and for word/line-level boxes: // WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> // See applyybox.cpp for more information. bool ParseBoxFileStr(const char* boxfile_str, int* page_number, STRING* utf8_str, TBOX* bounding_box) { *bounding_box = TBOX(); // Initialize it to empty. *utf8_str = ""; char uch[kBoxReadBufSize]; const char *buffptr = boxfile_str; // Read the unichar without messing up on Tibetan. // According to issue 253 the utf-8 surrogates 85 and A0 are treated // as whitespace by sscanf, so it is more reliable to just find // ascii space and tab. int uch_len = 0; while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' && uch_len < kBoxReadBufSize - 1) { uch[uch_len++] = *buffptr++; } uch[uch_len] = '\0'; if (*buffptr != '\0') ++buffptr; int x_min, y_min, x_max, y_max; *page_number = 0; int count = sscanf(buffptr, "%d %d %d %d %d", &x_min, &y_min, &x_max, &y_max, page_number); if (count != 5 && count != 4) { tprintf("Bad box coordinates in boxfile string!\n"); return false; } // Test for long space-delimited string label. if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != NULL) { strncpy(uch, buffptr + 1, kBoxReadBufSize); chomp_string(uch); uch_len = strlen(uch); } // Validate UTF8 by making unichars with it. int used = 0; while (used < uch_len) { UNICHAR ch(uch + used, uch_len - used); int new_used = ch.utf8_len(); if (new_used == 0) { tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used, uch[used], used + 1); return false; } used += new_used; } *utf8_str = uch; bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max); return true; // Successfully read a box. }
/* Tokenizes `cmd` in place via parse_arg(), appending each argument to
 * argv starting at *argc until the string is exhausted, parsing fails,
 * or SIZEOF_ARG entries are filled. NULL-terminates argv when room
 * remains; returns FALSE when the argument list overflowed. */
static bool
split_argv_string(const char *argv[SIZEOF_ARG], int *argc, char *cmd, bool remove_quotes)
{
	char *token;

	while (*cmd && *argc < SIZEOF_ARG) {
		token = parse_arg(&cmd, remove_quotes);
		if (token == NULL)
			break;
		argv[*argc] = token;
		*argc += 1;
		/* Trim leading whitespace before the next argument. */
		cmd = chomp_string(cmd);
	}

	if (*argc < SIZEOF_ARG)
		argv[*argc] = NULL;
	return *argc < SIZEOF_ARG;
}
// Loads ambiguity definitions from ambig_file, one per line, filling the
// replace_ambigs_ / dang_ambigs_ tables and, when use_ambigs_for_adaption
// is set, the (reverse_)ambigs_for_adaption_ vectors. The first line may
// be a version marker ("v<N>"); otherwise the file is rewound and treated
// as version 0. Lines that fail ParseAmbiguityLine() or InsertIntoTable()
// are skipped. Debug levels > 1/> 2 enable progress and dump printing.
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
                                      TFile *ambig_file, int debug_level,
                                      bool use_ambigs_for_adaption,
                                      UNICHARSET *unicharset) {
  int i, j;
  UnicharIdVector *adaption_ambigs_entry;
  if (debug_level) tprintf("Reading ambiguities\n");

  int test_ambig_part_size;
  int replacement_ambig_part_size;
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char replacement_string[kMaxAmbigStringSize];
  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    // No version marker: re-read the first line as a regular entry.
    ambig_file->Rewind();
  }
  while (ambig_file->FGets(buffer, kBufferSize) != NULL) {
    chomp_string(buffer);
    if (debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
                            buffer, &test_ambig_part_size, test_unichar_ids,
                            &replacement_ambig_part_size, replacement_string,
                            &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
                                                 : dang_ambigs_,
                         test_ambig_part_size, test_unichar_ids,
                         replacement_ambig_part_size, replacement_string,
                         type, ambig_spec, unicharset)) continue;

    // Update one_to_one_definite_ambigs_.
    if (test_ambig_part_size == 1 &&
        replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
        one_to_one_definite_ambigs_[test_unichar_ids[0]] =
            new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
    // Update ambigs_for_adaption_.
    if (use_ambigs_for_adaption) {
      GenericVector<UNICHAR_ID> encoding;
      // Silently ignore invalid strings, as before, so it is safe to use a
      // universal ambigs file.
      if (unicharset->encode_string(replacement_string, true, &encoding,
                                    NULL, NULL)) {
        for (i = 0; i < test_ambig_part_size; ++i) {
          if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
            ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
          }
          adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
          for (int r = 0; r < encoding.size(); ++r) {
            UNICHAR_ID id_to_insert = encoding[r];
            ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
            // Add the new unichar id to adaption_ambigs_entry (only if the
            // vector does not already contain it) keeping it in sorted order.
            for (j = 0; j < adaption_ambigs_entry->size() &&
                 (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
            if (j < adaption_ambigs_entry->size()) {
              if ((*adaption_ambigs_entry)[j] != id_to_insert) {
                adaption_ambigs_entry->insert(id_to_insert, j);
              }
            } else {
              adaption_ambigs_entry->push_back(id_to_insert);
            }
          }
        }
      }
    }
  }
  delete[] buffer;

  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
  if (use_ambigs_for_adaption) {
    for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
      adaption_ambigs_entry = ambigs_for_adaption_[i];
      if (adaption_ambigs_entry == NULL) continue;
      for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
        UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
        if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
          reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
        }
        reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
      }
    }
  }

  // Print what was read from the input file.
  if (debug_level > 1) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list();
             lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments,
                                     *unicharset);
        }
      }
    }
    if (use_ambigs_for_adaption) {
      // Dump both the forward and the reverse adaption tables.
      for (int vec_id = 0; vec_id < 2; ++vec_id) {
        const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
          ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
        for (i = 0; i < vec.size(); ++i) {
          adaption_ambigs_entry = vec[i];
          if (adaption_ambigs_entry != NULL) {
            tprintf("%sAmbigs for adaption for %s:\n",
                    (vec_id == 0) ? "" : "Reverse ",
                    unicharset->debug_str(i).string());
            for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
              tprintf("%s ", unicharset->debug_str(
                  (*adaption_ambigs_entry)[j]).string());
            }
            tprintf("\n");
          }
        }
      }
    }
  }
}
// wordlist2dawg command-line tool. Modes, selected by argc/flags:
//   argc == 4           : build a SquishedDawg from a word list.
//   argc == 5 (-t)      : check a word list against an existing dawg.
//   argc == 6 (-r N)    : build a dawg applying RTL reverse policy N.
//   argc == 7 (-l lo hi): build fixed-length dawgs for word lengths
//                         in [lo, hi].
// Returns 0 on success, 1 on usage/load errors; exits on I/O failures.
int main(int argc, char** argv) {
  int min_word_length;
  int max_word_length;
  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
        (argc == 6 && strcmp(argv[1], "-r") == 0) ||
        (argc == 7 && strcmp(argv[1], "-l") == 0 &&
         sscanf(argv[2], "%d", &min_word_length) == 1 &&
         sscanf(argv[3], "%d", &max_word_length) == 1))) {
    printf("Usage: %s [-t | -r [reverse policy] |"
           " -l min_len max_len] word_list_file"
           " dawg_file unicharset_file\n", argv[0]);
    return 1;
  }
  tesseract::Classify *classify = new tesseract::Classify();
  // argv_index is advanced past the option flag(s) so the three
  // positional file arguments can be read uniformly below.
  int argv_index = 0;
  if (argc == 5) ++argv_index;
  tesseract::Trie::RTLReversePolicy reverse_policy =
      tesseract::Trie::RRP_DO_NO_REVERSE;
  if (argc == 6) {
    ++argv_index;
    int tmp_int;
    sscanf(argv[++argv_index], "%d", &tmp_int);
    reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);
    tprintf("Set reverse_policy to %s\n",
            tesseract::Trie::get_reverse_policy_name(reverse_policy));
  }
  if (argc == 7) argv_index += 3;
  const char* wordlist_filename = argv[++argv_index];
  const char* dawg_filename = argv[++argv_index];
  const char* unicharset_file = argv[++argv_index];
  tprintf("Loading unicharset from '%s'\n", unicharset_file);
  if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
    tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
    delete classify;
    return 1;
  }
  const UNICHARSET &unicharset = classify->getDict().getUnicharset();
  if (argc == 4 || argc == 6) {
    // Build mode: read the word list into a Trie, squish it to a dawg,
    // and write it out.
    tesseract::Trie trie(
        // the first 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        kMaxNumEdges, unicharset.size(),
        classify->getDict().dawg_debug_level);
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    if (!trie.read_word_list(wordlist_filename, unicharset,
                             reverse_policy)) {
      tprintf("Failed to read word list from '%s'\n", wordlist_filename);
      exit(1);
    }
    tprintf("Reducing Trie to SquishedDawg\n");
    tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
    if (dawg != NULL && dawg->NumEdges() > 0) {
      tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
      dawg->write_squished_dawg(dawg_filename);
    } else {
      tprintf("Dawg is empty, skip producing the output file\n");
    }
    delete dawg;
  } else if (argc == 5) {
    // Check mode: verify each listed word exists in an existing dawg.
    tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
    tesseract::SquishedDawg words(
        dawg_filename,
        // these 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        classify->getDict().dawg_debug_level);
    tprintf("Checking word list from '%s'\n", wordlist_filename);
    words.check_for_words(wordlist_filename, unicharset, true);
  } else if (argc == 7) {
    // Place words of different lengths in separate Dawgs.
    char str[CHARS_PER_LINE];
    FILE *word_file = fopen(wordlist_filename, "rb");
    if (word_file == NULL) {
      tprintf("Failed to open wordlist file %s\n", wordlist_filename);
      exit(1);
    }
    FILE *dawg_file = fopen(dawg_filename, "wb");
    if (dawg_file == NULL) {
      tprintf("Failed to open dawg output file %s\n", dawg_filename);
      exit(1);
    }
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    // One Trie per word length in [min_word_length, max_word_length].
    GenericVector<tesseract::Trie *> trie_vec;
    int i;
    for (i = min_word_length; i <= max_word_length; ++i) {
      trie_vec.push_back(new tesseract::Trie(
          // the first 3 arguments are not used in this case
          tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
          kMaxNumEdges, unicharset.size(),
          classify->getDict().dawg_debug_level));
    }
    while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
      chomp_string(str);  // remove newline
      int badpos;
      if (!unicharset.encodable_string(str, &badpos)) {
        tprintf("String '%s' not compatible with unicharset. "
                "Bad chars here: '%s'\n", str, str + badpos);
        continue;
      }
      WERD_CHOICE word(str, unicharset);
      // Apply the RTL reverse policy before length-bucketing the word.
      if ((reverse_policy == tesseract::Trie::RRP_REVERSE_IF_HAS_RTL &&
           word.has_rtl_unichar_id()) ||
          reverse_policy == tesseract::Trie::RRP_FORCE_REVERSE) {
        word.reverse_and_mirror_unichar_ids();
      }
      if (word.length() >= min_word_length &&
          word.length() <= max_word_length &&
          !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
        tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
        if (!curr_trie->word_in_dawg(word)) {
          if (!curr_trie->add_word_to_dawg(word)) {
            tprintf("Failed to add the following word to dawg:\n");
            word.print();
            exit(1);
          }
          if (classify->getDict().dawg_debug_level > 1) {
            tprintf("Added word %s of length %d\n", str, word.length());
          }
          // Sanity check: the word must be retrievable after insertion.
          if (!curr_trie->word_in_dawg(word)) {
            tprintf("Error: word '%s' not in DAWG after adding it\n", str);
            exit(1);
          }
        }
      }
    }
    fclose(word_file);
    tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
    // Entries below min_word_length are NULL placeholders.
    GenericVector<tesseract::SquishedDawg *> dawg_vec;
    for (i = 0; i <= max_word_length; ++i) {
      dawg_vec.push_back(i < min_word_length ? NULL :
                         trie_vec[i-min_word_length]->trie_to_dawg());
    }
    tesseract::Dict::WriteFixedLengthDawgs(
        dawg_vec, max_word_length - min_word_length + 1,
        classify->getDict().dawg_debug_level, dawg_file);
    fclose(dawg_file);
    dawg_vec.delete_data_pointers();
    trie_vec.delete_data_pointers();
  } else {  // should never get here
    tprintf("Invalid command-line options\n");
    exit(1);
  }
  delete classify;
  return 0;
}
// Loads ambiguity definitions from AmbigFile up to end_offset (or to EOF
// when end_offset < 0), filling replace_ambigs_ / dang_ambigs_ and, when
// use_definite_ambigs_for_classifier is set, one_to_one_definite_ambigs_.
// The tables are first sized to the unicharset with NULL entries. The
// first line may be a version marker ("v<N>"); otherwise the file is
// rewound and treated as version 0.
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
                                      UNICHARSET *unicharset) {
  int i;
  // One (initially empty) slot per unichar id in every table.
  for (i = 0; i < unicharset->size(); ++i) {
    replace_ambigs_.push_back(NULL);
    dang_ambigs_.push_back(NULL);
    one_to_one_definite_ambigs_.push_back(NULL);
  }
  if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");

  int TestAmbigPartSize;
  int ReplacementAmbigPartSize;
  // Maximum line size:
  // 10 for sizes of ambigs, tabs, abmig type and newline
  // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char ReplacementString[kMaxAmbigStringSize];
  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    // No version marker: re-read the first line as a regular entry.
    rewind(AmbigFile);
  }
  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
    chomp_string(buffer);
    if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
                            &TestAmbigPartSize, TestUnicharIds,
                            &ReplacementAmbigPartSize, ReplacementString,
                            &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
                    TestAmbigPartSize, TestUnicharIds,
                    ReplacementAmbigPartSize, ReplacementString, type,
                    ambig_spec, unicharset);

    // Update one_to_one_definite_ambigs_.
    if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
        one_to_one_definite_ambigs_[TestUnicharIds[0]] =
            new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
  }
  delete[] buffer;

  // Print what was read from the input file.
  if (global_ambigs_debug_level > 2) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list();
             lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments,
                                     *unicharset);
        }
      }
    }
  }
}
// As read_next_box above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
// On success fills utf8_str and the four coordinates and returns true.
// NOTE(review): on EOF this closes box_file before returning false —
// callers must not use the FILE* again after a false return; confirm
// this ownership transfer is intended at every call site.
bool read_next_box(int target_page, int *line_number, FILE* box_file,
                   char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  int count = 0;
  int page = 0;
  char buff[kBoxReadBufSize];   // boxfile read buffer
  char uch[kBoxReadBufSize];
  char *buffptr = buff;

  while (fgets(buff, sizeof(buff) - 1, box_file)) {
    (*line_number)++;
    buffptr = buff;
    const unsigned char *ubuf =
        reinterpret_cast<const unsigned char*>(buffptr);
    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
      buffptr += 3;  // Skip unicode file designation.
    // Check for blank lines in box file
    while (*buffptr == ' ' || *buffptr == '\t')
      buffptr++;
    if (*buffptr != '\0') {
      // Read the unichar without messing up on Tibetan.
      // According to issue 253 the utf-8 surrogates 85 and A0 are treated
      // as whitespace by sscanf, so it is more reliable to just find
      // ascii space and tab.
      int uch_len = 0;
      while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t')
        uch[uch_len++] = *buffptr++;
      uch[uch_len] = '\0';
      if (*buffptr != '\0') ++buffptr;  // step over the separator
      count = sscanf(buffptr, "%d %d %d %d %d",
                     x_min, y_min, x_max, y_max, &page);
      if (count != 5) {
        if (target_page <= 0) {
          // If target_page is negative or zero, allow lines with no page
          // number
          page = 0;
          count = sscanf(buffptr, "%d %d %d %d", x_min, y_min, x_max, y_max);
        } else {
          tprintf("Box file format error on line %i; ignored\n",
                  *line_number);
          continue;
        }
      }
      if (target_page >= 0 && target_page != page)
        continue;  // Not on the appropriate page.
      // Test for long space-delimited string label.
      if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
          (buffptr = strchr(buffptr, '#')) != NULL) {
        // Safe: buffptr points into buff, which has the same size as uch.
        strcpy(uch, buffptr + 1);
        chomp_string(uch);
        uch_len = strlen(uch);
      }
      // Validate UTF8 by making unichars with it.
      int used = 0;
      while (used < uch_len) {
        UNICHAR ch(uch + used, uch_len - used);
        int new_used = ch.utf8_len();
        if (new_used == 0) {
          tprintf("Bad UTF-8 str %s starts with 0x%02x at line %d, col %d\n",
                  uch + used, uch[used], *line_number, used + 1);
          count = 0;
          break;
        }
        used += new_used;
      }
      if (count < 4 || used == 0) {
        tprintf("Box file format error on line %i; ignored\n", *line_number);
      } else {
        strncpy(utf8_str, uch, kBoxReadBufSize);
        return true;  // Successfully read a box.
      }
    }
  }
  fclose(box_file);
  return false;  // EOF
}