Пример #1
0
Файл: io.c Проект: Oblomov/tig
static int
io_load_file(struct io *io, const char *separators,
	     size_t *lineno, io_read_fn read_property, void *data)
{
	struct buffer buf;
	int state = OK;

	while (state == OK && io_get_line(io, &buf, '\n', lineno, TRUE)) {
		char *name;
		char *value;
		size_t namelen;
		size_t valuelen;

		name = chomp_string(buf.data);
		namelen = strcspn(name, separators);

		if (name[namelen]) {
			name[namelen] = 0;
			value = chomp_string(name + namelen + 1);
			valuelen = strlen(value);

		} else {
			value = "";
			valuelen = 0;
		}

		state = read_property(name, namelen, value, valuelen, data);
	}

	if (state != ERR && io_error(io))
		state = ERR;
	io_done(io);

	return state;
}
Пример #2
0
void
parse_author_line(char *ident, const struct ident **author, struct time *time)
{
	char *nameend = strchr(ident, '<');
	char *emailend = strchr(ident, '>');
	const char *name, *email = "";

	if (nameend && emailend)
		*nameend = *emailend = 0;
	name = chomp_string(ident);
	if (nameend)
		email = chomp_string(nameend + 1);
	if (!*name)
		name = *email ? email : unknown_ident.name;
	if (!*email)
		email = *name ? name : unknown_ident.email;

	*author = get_author(name, email);

	/* Parse epoch and timezone */
	if (time && emailend && emailend[1] == ' ') {
		char *secs = emailend + 2;
		char *zone = strchr(secs, ' ');

		parse_timesec(time, secs);

		if (zone && strlen(zone) == STRING_SIZE(" +0700"))
			parse_timezone(time, zone + 1);
	}
}
Пример #3
0
bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset,
                                  SetParamConstraint constraint,
                                  ParamsVectors *member_params) {
  char line[MAX_PATH];           // input line
  bool anyerr = false;           // true if any error
  bool foundit;                  // found parameter
  inT16 length;                  // length of line
  char *valptr;                  // value field

  while ((end_offset < 0 || ftell(fp) < end_offset) &&
         fgets(line, MAX_PATH, fp)) {
    if (line[0] != '\n' && line[0] != '#') {
      length = strlen (line);
      chomp_string(line);  // remove newline
      for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
        valptr++);
      if (*valptr) {             // found blank
        *valptr = '\0';          // make name a string
        do
          valptr++;              // find end of blanks
        while (*valptr == ' ' || *valptr == '\t');
      }
      foundit = SetParam(line, valptr, constraint, member_params);

      if (!foundit) {
        anyerr = true;         // had an error
        tprintf("read_params_file: parameter not found: %s\n", line);
        exit(1);
      }
    }
  }
  return anyerr;
}
Пример #4
0
bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
                                  ParamsVectors *member_params) {
  char line[MAX_PATH];           // input line
  bool anyerr = false;           // true if any error
  bool foundit;                  // found parameter
  char *valptr;                  // value field

  while (fp->FGets(line, MAX_PATH) != nullptr) {
    if (line[0] != '\r' && line[0] != '\n' && line[0] != '#') {
      chomp_string(line);  // remove newline
      for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
        valptr++);
      if (*valptr) {             // found blank
        *valptr = '\0';          // make name a string
        do
          valptr++;              // find end of blanks
        while (*valptr == ' ' || *valptr == '\t');
      }
      foundit = SetParam(line, valptr, constraint, member_params);

      if (!foundit) {
        anyerr = true;         // had an error
        tprintf("Warning: Parameter not found: %s\n", line);
      }
    }
  }
  return anyerr;
}
Пример #5
0
int Dawg::check_for_words(const char *filename,
                          const UNICHARSET &unicharset,
                          bool enable_wildcard) const {
  if (filename == nullptr) return 0;

  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  int misses = 0;
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);

  word_file = fopen(filename, "r");
  if (word_file == nullptr) {
    tprintf("Error: Could not open file %s\n", filename);
    ASSERT_HOST(word_file);
  }

  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if (word.length() > 0 &&
        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!match_words(&word, 0, 0,
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
        tprintf("Missing word: %s\n", string);
        ++misses;
      }
    } else {
      tprintf("Failed to create a valid word from %s\n", string);
    }
  }
  fclose (word_file);
  // Make sure the user sees this with fprintf instead of tprintf.
  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
  return misses;
}
Пример #6
0
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
// The box file is assumed to contain box definitions, one per line, of the
// following format for blob-level boxes:
//   <UTF8 str> <left> <bottom> <right> <top> <page id>
// and for word/line-level boxes:
//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
// See applyybox.cpp for more information.
bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
                     STRING* utf8_str, TBOX* bounding_box) {
  *bounding_box = TBOX();       // Initialize it to empty.
  *utf8_str = "";
  char uch[kBoxReadBufSize];
  const char *buffptr = boxfile_str;
  // Read the unichar without messing up on Tibetan.
  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
  // as whitespace by sscanf, so it is more reliable to just find
  // ascii space and tab.
  int uch_len = 0;
  // Skip unicode file designation, if present.
  const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
  if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
      buffptr += 3;
  // Allow a single blank as the UTF-8 string. Check for empty string and
  // then blindly eat the first character.
  if (*buffptr == '\0') return false;
  do {
    uch[uch_len++] = *buffptr++;
  } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
           uch_len < kBoxReadBufSize - 1);
  uch[uch_len] = '\0';
  if (*buffptr != '\0') ++buffptr;
  int x_min, y_min, x_max, y_max;
  *page_number = 0;
  int count = sscanf(buffptr, "%d %d %d %d %d",
                 &x_min, &y_min, &x_max, &y_max, page_number);
  if (count != 5 && count != 4) {
    tprintf("Bad box coordinates in boxfile string! %s\n", ubuf);
    return false;
  }
  // Test for long space-delimited string label.
  if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
      (buffptr = strchr(buffptr, '#')) != NULL) {
    strncpy(uch, buffptr + 1, kBoxReadBufSize - 1);
    uch[kBoxReadBufSize - 1] = '\0';  // Prevent buffer overrun.
    chomp_string(uch);
    uch_len = strlen(uch);
  }
  // Validate UTF8 by making unichars with it.
  int used = 0;
  while (used < uch_len) {
    UNICHAR ch(uch + used, uch_len - used);
    int new_used = ch.utf8_len();
    if (new_used == 0) {
      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
              uch + used, uch[used], used + 1);
      return false;
    }
    used += new_used;
  }
  *utf8_str = uch;
  if (x_min > x_max) Swap(&x_min, &x_max);
  if (y_min > y_max) Swap(&y_min, &y_max);
  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
  return true;  // Successfully read a box.
}
Пример #7
0
Файл: argv.c Проект: apieum/tig
static bool
split_argv_string(const char *argv[SIZEOF_ARG], int *argc, char *cmd, bool remove_quotes)
{
	while (*cmd && *argc < SIZEOF_ARG) {
		char quoted = 0;
		int valuelen = get_arg_valuelen(cmd, &quoted);
		bool advance = cmd[valuelen] != 0;
		int quote_offset = !!(quoted && remove_quotes);

		cmd[valuelen - quote_offset] = 0;
		argv[(*argc)++] = chomp_string(cmd + quote_offset);
		cmd = chomp_string(cmd + valuelen + advance);
	}

	if (*argc < SIZEOF_ARG)
		argv[*argc] = NULL;
	return *argc < SIZEOF_ARG;
}
Пример #8
0
Файл: io.c Проект: Oblomov/tig
bool
io_read_buf(struct io *io, char buf[], size_t bufsize)
{
	struct buffer result = {0};

	if (io_get(io, &result, '\n', TRUE)) {
		result.data = chomp_string(result.data);
		string_ncopy_do(buf, bufsize, result.data, strlen(result.data));
	}

	return io_done(io) && result.data;
}
Пример #9
0
int main(int argc, char** argv) {
  tesseract::CheckSharedLibraryVersion();

  // Parse input arguments.
  if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
    printf("%s\n", tesseract::TessBaseAPI::Version());
    return 0;
  } else if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
    printf("Usage: %s -v | --version | %s [-l lang] tessdata_dir wordlist_file"
           " output_ambiguous_wordlist_file\n", argv[0], argv[0]);
    return 1;
  }
  int argv_offset = 0;
  STRING lang;
  if (argc == 6) {
    lang = argv[2];
    argv_offset = 2;
  } else {
    lang = "eng";
  }
  const char *tessdata_dir = argv[++argv_offset];
  const char *input_file_str = argv[++argv_offset];
  const char *output_file_str = argv[++argv_offset];

  // Initialize Tesseract.
  tesseract::TessBaseAPI api;
  GenericVector<STRING> vars_vec;
  GenericVector<STRING> vars_values;
  vars_vec.push_back("output_ambig_words_file");
  vars_values.push_back(output_file_str);
  api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY, nullptr,
           0, &vars_vec, &vars_values, false);
  tesseract::Dict &dict = api.tesseract()->getDict();
  FILE *input_file = fopen(input_file_str, "rb");
  if (input_file == nullptr) {
    tprintf("Failed to open input wordlist file %s\n", input_file_str);
    exit(1);
  }
  char str[CHARS_PER_LINE];

  // Read word list and call Dict::NoDangerousAmbig() for each word
  // to record ambiguities in the output file.
  while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) {
    chomp_string(str);  // remove newline
    WERD_CHOICE word(str, dict.getUnicharset());
    dict.NoDangerousAmbig(&word, nullptr, false, nullptr);
  }
  // Clean up.
  fclose(input_file);
}
Пример #10
0
// Parses the given box file string into a page_number, utf8_str, and
// bounding_box. Returns true on a successful parse.
// The box file is assumed to contain box definitions, one per line, of the
// following format for blob-level boxes:
//   <UTF8 str> <left> <bottom> <right> <top> <page id>
// and for word/line-level boxes:
//   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
// See applyybox.cpp for more information.
bool ParseBoxFileStr(const char* boxfile_str, int* page_number,
                     STRING* utf8_str, TBOX* bounding_box) {
  *bounding_box = TBOX();       // Initialize it to empty.
  *utf8_str = "";
  char uch[kBoxReadBufSize];
  const char *buffptr = boxfile_str;
  // Read the unichar without messing up on Tibetan.
  // According to issue 253 the utf-8 surrogates 85 and A0 are treated
  // as whitespace by sscanf, so it is more reliable to just find
  // ascii space and tab.
  int uch_len = 0;
  while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' &&
         uch_len < kBoxReadBufSize - 1) {
    uch[uch_len++] = *buffptr++;
  }
  uch[uch_len] = '\0';
  if (*buffptr != '\0') ++buffptr;
  int x_min, y_min, x_max, y_max;
  *page_number = 0;
  int count = sscanf(buffptr, "%d %d %d %d %d",
                 &x_min, &y_min, &x_max, &y_max, page_number);
  if (count != 5 && count != 4) {
    tprintf("Bad box coordinates in boxfile string!\n");
    return false;
  }
  // Test for long space-delimited string label.
  if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
      (buffptr = strchr(buffptr, '#')) != NULL) {
    strncpy(uch, buffptr + 1, kBoxReadBufSize);
    chomp_string(uch);
    uch_len = strlen(uch);
  }
  // Validate UTF8 by making unichars with it.
  int used = 0;
  while (used < uch_len) {
    UNICHAR ch(uch + used, uch_len - used);
    int new_used = ch.utf8_len();
    if (new_used == 0) {
      tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n",
              uch + used, uch[used], used + 1);
      return false;
    }
    used += new_used;
  }
  *utf8_str = uch;
  bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max);
  return true;  // Successfully read a box.
}
Пример #11
0
Файл: argv.c Проект: Oblomov/tig
static bool
split_argv_string(const char *argv[SIZEOF_ARG], int *argc, char *cmd, bool remove_quotes)
{
	while (*cmd && *argc < SIZEOF_ARG) {
		char *arg = parse_arg(&cmd, remove_quotes);

		if (!arg)
			break;
		argv[(*argc)++] = arg;
		cmd = chomp_string(cmd);
	}

	if (*argc < SIZEOF_ARG)
		argv[*argc] = NULL;
	return *argc < SIZEOF_ARG;
}
Пример #12
0
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
                                      TFile *ambig_file,
                                      int debug_level,
                                      bool use_ambigs_for_adaption,
                                      UNICHARSET *unicharset) {
  int i, j;
  UnicharIdVector *adaption_ambigs_entry;
  if (debug_level) tprintf("Reading ambiguities\n");

  int test_ambig_part_size;
  int replacement_ambig_part_size;
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char replacement_string[kMaxAmbigStringSize];
  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    ambig_file->Rewind();
  }
  while (ambig_file->FGets(buffer, kBufferSize) != NULL) {
    chomp_string(buffer);
    if (debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
                            buffer, &test_ambig_part_size, test_unichar_ids,
                            &replacement_ambig_part_size,
                            replacement_string, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
                                                 : dang_ambigs_,
                         test_ambig_part_size, test_unichar_ids,
                         replacement_ambig_part_size, replacement_string, type,
                         ambig_spec, unicharset))
      continue;

    // Update one_to_one_definite_ambigs_.
    if (test_ambig_part_size == 1 &&
        replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
        one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
    // Update ambigs_for_adaption_.
    if (use_ambigs_for_adaption) {
      GenericVector<UNICHAR_ID> encoding;
      // Silently ignore invalid strings, as before, so it is safe to use a
      // universal ambigs file.
      if (unicharset->encode_string(replacement_string, true, &encoding,
                                    NULL, NULL)) {
        for (i = 0; i < test_ambig_part_size; ++i) {
          if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
            ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
          }
          adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
          for (int r = 0; r < encoding.size(); ++r) {
            UNICHAR_ID id_to_insert = encoding[r];
            ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
            // Add the new unichar id to adaption_ambigs_entry (only if the
            // vector does not already contain it) keeping it in sorted order.
            for (j = 0; j < adaption_ambigs_entry->size() &&
                 (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
            if (j < adaption_ambigs_entry->size()) {
              if ((*adaption_ambigs_entry)[j] != id_to_insert) {
                adaption_ambigs_entry->insert(id_to_insert, j);
              }
            } else {
              adaption_ambigs_entry->push_back(id_to_insert);
            }
          }
        }
      }
    }
  }
  delete[] buffer;

  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
  if (use_ambigs_for_adaption) {
    for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
      adaption_ambigs_entry = ambigs_for_adaption_[i];
      if (adaption_ambigs_entry == NULL) continue;
      for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
        UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
        if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
          reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
        }
        reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
      }
    }
  }

  // Print what was read from the input file.
  if (debug_level > 1) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
        }
      }
    }
    if (use_ambigs_for_adaption) {
      for (int vec_id = 0; vec_id < 2; ++vec_id) {
        const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
          ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
        for (i = 0; i < vec.size(); ++i) {
          adaption_ambigs_entry = vec[i];
          if (adaption_ambigs_entry != NULL) {
            tprintf("%sAmbigs for adaption for %s:\n",
                    (vec_id == 0) ? "" : "Reverse ",
                    unicharset->debug_str(i).string());
            for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
              tprintf("%s ", unicharset->debug_str(
                  (*adaption_ambigs_entry)[j]).string());
            }
            tprintf("\n");
          }
        }
      }
    }
  }
}
Пример #13
0
int main(int argc, char** argv) {
  int min_word_length;
  int max_word_length;
  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
      (argc == 6 && strcmp(argv[1], "-r") == 0) ||
      (argc == 7 && strcmp(argv[1], "-l") == 0 &&
         sscanf(argv[2], "%d", &min_word_length) == 1 &&
         sscanf(argv[3], "%d", &max_word_length) == 1))) {
    printf("Usage: %s [-t | -r [reverse policy] |"
           " -l min_len max_len] word_list_file"
           " dawg_file unicharset_file\n", argv[0]);
    return 1;
  }
  tesseract::Classify *classify = new tesseract::Classify();
  int argv_index = 0;
  if (argc == 5) ++argv_index;
  tesseract::Trie::RTLReversePolicy reverse_policy =
      tesseract::Trie::RRP_DO_NO_REVERSE;
  if (argc == 6) {
    ++argv_index;
    int tmp_int;
    sscanf(argv[++argv_index], "%d", &tmp_int);
    reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);
    tprintf("Set reverse_policy to %s\n",
            tesseract::Trie::get_reverse_policy_name(reverse_policy));
  }
  if (argc == 7) argv_index += 3;
  const char* wordlist_filename = argv[++argv_index];
  const char* dawg_filename = argv[++argv_index];
  const char* unicharset_file = argv[++argv_index];
  tprintf("Loading unicharset from '%s'\n", unicharset_file);
  if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
    tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
    delete classify;
    return 1;
  }
  const UNICHARSET &unicharset = classify->getDict().getUnicharset();
  if (argc == 4 || argc == 6) {
    tesseract::Trie trie(
        // the first 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        kMaxNumEdges, unicharset.size(),
        classify->getDict().dawg_debug_level);
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    if (!trie.read_word_list(wordlist_filename, unicharset, reverse_policy)) {
      tprintf("Failed to read word list from '%s'\n", wordlist_filename);
      exit(1);
    }
    tprintf("Reducing Trie to SquishedDawg\n");
    tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
    if (dawg != NULL && dawg->NumEdges() > 0) {
      tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
      dawg->write_squished_dawg(dawg_filename);
    } else {
      tprintf("Dawg is empty, skip producing the output file\n");
    }
    delete dawg;
  } else if (argc == 5) {
    tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
    tesseract::SquishedDawg words(
        dawg_filename,
        // these 3 arguments are not used in this case
        tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
        classify->getDict().dawg_debug_level);
    tprintf("Checking word list from '%s'\n", wordlist_filename);
    words.check_for_words(wordlist_filename, unicharset, true);
  } else if (argc == 7) {
    // Place words of different lengths in separate Dawgs.
    char str[CHARS_PER_LINE];
    FILE *word_file = fopen(wordlist_filename, "rb");
    if (word_file == NULL) {
      tprintf("Failed to open wordlist file %s\n", wordlist_filename);
      exit(1);
    }
    FILE *dawg_file = fopen(dawg_filename, "wb");
    if (dawg_file == NULL) {
      tprintf("Failed to open dawg output file %s\n", dawg_filename);
      exit(1);
    }
    tprintf("Reading word list from '%s'\n", wordlist_filename);
    GenericVector<tesseract::Trie *> trie_vec;
    int i;
    for (i = min_word_length; i <= max_word_length; ++i) {
      trie_vec.push_back(new tesseract::Trie(
          // the first 3 arguments are not used in this case
          tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
          kMaxNumEdges, unicharset.size(),
          classify->getDict().dawg_debug_level));
    }
    while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
      chomp_string(str);  // remove newline
      int badpos;
      if (!unicharset.encodable_string(str, &badpos)) {
        tprintf("String '%s' not compatible with unicharset. "
                "Bad chars here: '%s'\n", str, str + badpos);
        continue;
      }
      WERD_CHOICE word(str, unicharset);
      if ((reverse_policy == tesseract::Trie::RRP_REVERSE_IF_HAS_RTL &&
          word.has_rtl_unichar_id()) ||
          reverse_policy == tesseract::Trie::RRP_FORCE_REVERSE) {
        word.reverse_and_mirror_unichar_ids();
      }
      if (word.length() >= min_word_length &&
          word.length() <= max_word_length &&
          !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
        tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
        if (!curr_trie->word_in_dawg(word)) {
          if (!curr_trie->add_word_to_dawg(word)) {
            tprintf("Failed to add the following word to dawg:\n");
            word.print();
            exit(1);
          }
          if (classify->getDict().dawg_debug_level > 1) {
            tprintf("Added word %s of length %d\n", str, word.length());
          }
          if (!curr_trie->word_in_dawg(word)) {
            tprintf("Error: word '%s' not in DAWG after adding it\n", str);
            exit(1);
          }
        }
      }
    }
    fclose(word_file);
    tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
    GenericVector<tesseract::SquishedDawg *> dawg_vec;
    for (i = 0; i <= max_word_length; ++i) {
      dawg_vec.push_back(i < min_word_length ? NULL :
                         trie_vec[i-min_word_length]->trie_to_dawg());
    }
    tesseract::Dict::WriteFixedLengthDawgs(
        dawg_vec, max_word_length - min_word_length + 1,
        classify->getDict().dawg_debug_level, dawg_file);
    fclose(dawg_file);
    dawg_vec.delete_data_pointers();
    trie_vec.delete_data_pointers();
  } else {  // should never get here
    tprintf("Invalid command-line options\n");
    exit(1);
  }
  delete classify;
  return 0;
}
Пример #14
0
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
                                      UNICHARSET *unicharset) {
  int i;
  for (i = 0; i < unicharset->size(); ++i) {
    replace_ambigs_.push_back(NULL);
    dang_ambigs_.push_back(NULL);
    one_to_one_definite_ambigs_.push_back(NULL);
  }
  if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");

  int TestAmbigPartSize;
  int ReplacementAmbigPartSize;
  // Maximum line size:
  //   10 for sizes of ambigs, tabs, abmig type and newline
  //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char ReplacementString[kMaxAmbigStringSize];
  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    rewind(AmbigFile);
  }
  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
    chomp_string(buffer);
    if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
                            &TestAmbigPartSize, TestUnicharIds,
                            &ReplacementAmbigPartSize,
                            ReplacementString, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
                    TestAmbigPartSize, TestUnicharIds,
                    ReplacementAmbigPartSize, ReplacementString, type,
                    ambig_spec, unicharset);

    // Update one_to_one_definite_ambigs_.
    if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
        one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
  }
  delete[] buffer;
  // Print what was read from the input file.
  if (global_ambigs_debug_level > 2) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
        }
      }
    }
  }
}
Пример #15
0
// As read_next_box above, but get a specific page number. (0-based)
// Use -1 to read any page number. Files without page number all
// read as if they are page 0.
bool read_next_box(int target_page, int *line_number,
                   FILE* box_file, char* utf8_str,
                   int* x_min, int* y_min, int* x_max, int* y_max) {
  int count = 0;
  int page = 0;
  char buff[kBoxReadBufSize];   // boxfile read buffer
  char uch[kBoxReadBufSize];
  char *buffptr = buff;

  while (fgets(buff, sizeof(buff) - 1, box_file)) {
    (*line_number)++;

    buffptr = buff;
    const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
    if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
      buffptr += 3;  // Skip unicode file designation.
    // Check for blank lines in box file
    while (*buffptr == ' ' || *buffptr == '\t')
      buffptr++;
    if (*buffptr != '\0') {
      // Read the unichar without messing up on Tibetan.
      // According to issue 253 the utf-8 surrogates 85 and A0 are treated
      // as whitespace by sscanf, so it is more reliable to just find
      // ascii space and tab.
      int uch_len = 0;
      while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t')
        uch[uch_len++] = *buffptr++;
      uch[uch_len] = '\0';
      if (*buffptr != '\0') ++buffptr;
      count = sscanf(buffptr, "%d %d %d %d %d",
                     x_min, y_min, x_max, y_max, &page);
      if (count != 5) {
        if (target_page <= 0) {
          // If target_page is negative or zero, allow lines with no page number
          page = 0;
          count = sscanf(buffptr, "%d %d %d %d", x_min, y_min, x_max, y_max);
        } else {
          tprintf("Box file format error on line %i; ignored\n", *line_number);
          continue;
        }
      }
      if (target_page >= 0 && target_page != page)
        continue;  // Not on the appropriate page.
      // Test for long space-delimited string label.
      if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
          (buffptr = strchr(buffptr, '#')) != NULL) {
        strcpy(uch, buffptr + 1);
        chomp_string(uch);
        uch_len = strlen(uch);
      }
      // Validate UTF8 by making unichars with it.
      int used = 0;
      while (used < uch_len) {
        UNICHAR ch(uch + used, uch_len - used);
        int new_used = ch.utf8_len();
        if (new_used == 0) {
          tprintf("Bad UTF-8 str %s starts with 0x%02x at line %d, col %d\n",
                  uch + used, uch[used], *line_number, used + 1);
          count = 0;
          break;
        }
        used += new_used;
      }
      if (count < 4 || used == 0) {
        tprintf("Box file format error on line %i; ignored\n", *line_number);
      } else {
        strncpy(utf8_str, uch, kBoxReadBufSize);
        return true;  // Successfully read a box.
      }
    }
  }
  fclose(box_file);
  return false;  // EOF
}