Esempio n. 1
0
/**
 * wc_to_unichar_id
 *
 * Maps the character code wc to its id in the given unicharset.
 *
 * The code is wrapped in a UNICHAR, rendered to a C string via utf8_str(),
 * and that string is looked up with unicharset.unichar_to_id(). The buffer
 * handed back by utf8_str() is released here with delete[].
 */
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
  UNICHAR converted(wc);
  char *utf8 = converted.utf8_str();
  const UNICHAR_ID id = unicharset.unichar_to_id(utf8);
  delete[] utf8;  // utf8_str() result is owned by this function.
  return id;
}
/**
 * WERD_CHOICE::init
 *
 * Fills this WERD_CHOICE from a string, optional per-unichar lengths,
 * a rating, a certainty and a permuter.
 *
 * src_string must not be NULL: it is handed to strlen() unconditionally.
 * src_lengths may be NULL, in which case every unichar is taken to occupy
 * exactly one char of src_string. Otherwise src_lengths[i] is the number of
 * chars of src_string consumed by the i-th unichar, and strlen(src_lengths)
 * is the number of unichars.
 *
 * An empty src_string produces a choice initialised through init(8).
 */
void WERD_CHOICE::init(const char *src_string,
                       const char *src_lengths,
                       float src_rating,
                       float src_certainty,
                       uinT8 src_permuter,
                       const UNICHARSET &unicharset) {
  const int string_len = strlen(src_string);
  if (string_len != 0) {
    // One slot per entry of src_lengths, or per char when it is absent.
    this->init(src_lengths ? strlen(src_lengths): string_len);
    length_ = reserved_;
    // Walk src_string one unichar at a time.
    const char *cursor = src_string;
    for (int idx = 0; idx < length_; ++idx) {
      int step = 1;
      if (src_lengths) {
        step = src_lengths[idx];
      }
      unichar_ids_[idx] = unicharset.unichar_to_id(cursor, step);
      fragment_lengths_[idx] = 1;
      cursor += step;
    }
  } else {
    this->init(8);
  }
  permuter_ = src_permuter;
  certainty_ = src_certainty;
  rating_ = src_rating;
}
Esempio n. 3
0
int Dawg::check_for_words(const char *filename,
                          const UNICHARSET &unicharset,
                          bool enable_wildcard) const {
  if (filename == nullptr) return 0;

  FILE       *word_file;
  char       string [CHARS_PER_LINE];
  int misses = 0;
  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);

  word_file = fopen(filename, "r");
  if (word_file == nullptr) {
    tprintf("Error: Could not open file %s\n", filename);
    ASSERT_HOST(word_file);
  }

  while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) {
    chomp_string(string);  // remove newline
    WERD_CHOICE word(string, unicharset);
    if (word.length() > 0 &&
        !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
      if (!match_words(&word, 0, 0,
                       enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
        tprintf("Missing word: %s\n", string);
        ++misses;
      }
    } else {
      tprintf("Failed to create a valid word from %s\n", string);
    }
  }
  fclose (word_file);
  // Make sure the user sees this with fprintf instead of tprintf.
  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
  return misses;
}
Esempio n. 4
0
bool UnicharAmbigs::ParseAmbiguityLine(
    int line_num, int version, int debug_level, const UNICHARSET &unicharset,
    char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
    int *replacement_ambig_part_size, char *replacement_string, int *type) {
  if (version > 1) {
    // Simpler format is just wrong-string correct-string type\n.
    STRING input(buffer);
    GenericVector<STRING> fields;
    input.split(' ', &fields);
    if (fields.size() != 3) {
      if (debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
    // Encode wrong-string.
    GenericVector<UNICHAR_ID> unichars;
    if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL,
                                  NULL)) {
      return false;
    }
    *test_ambig_part_size = unichars.size();
    if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
      if (debug_level)
        tprintf("Too many unichars in ambiguity on line %d\n", line_num);
      return false;
    }
    // Copy encoded string to output.
    for (int i = 0; i < unichars.size(); ++i)
      test_unichar_ids[i] = unichars[i];
    test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
    // Encode replacement-string to check validity.
    if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL,
                                  NULL)) {
      return false;
    }
    *replacement_ambig_part_size = unichars.size();
    if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
      if (debug_level)
        tprintf("Too many unichars in ambiguity on line %d\n", line_num);
      return false;
    }
    if (sscanf(fields[2].string(), "%d", type) != 1) {
      if (debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
    snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string());
    return true;
  }
  int i;
  char *token;
  char *next_token;
  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
      !sscanf(token, "%d", test_ambig_part_size) ||
      *test_ambig_part_size <= 0) {
    if (debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
    if (debug_level)
      tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  for (i = 0; i < *test_ambig_part_size; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    if (!unicharset.contains_unichar(token)) {
      if (debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
    test_unichar_ids[i] = unicharset.unichar_to_id(token);
  }
  test_unichar_ids[i] = INVALID_UNICHAR_ID;

  if (i != *test_ambig_part_size ||
      !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
      !sscanf(token, "%d", replacement_ambig_part_size) ||
        *replacement_ambig_part_size <= 0) {
    if (debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
    if (debug_level)
      tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  replacement_string[0] = '\0';
  for (i = 0; i < *replacement_ambig_part_size; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    strcat(replacement_string, token);
    if (!unicharset.contains_unichar(token)) {
      if (debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
  }
  if (i != *replacement_ambig_part_size) {
    if (debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (version > 0) {
    // The next field being true indicates that the abiguity should
    // always be substituted (e.g. '' should always be changed to ").
    // For such "certain" n -> m ambigs tesseract will insert character
    // fragments for the n pieces in the unicharset. AmbigsFound()
    // will then replace the incorrect ngram with the character
    // fragments of the correct character (or ngram if m > 1).
    // Note that if m > 1, an ngram will be inserted into the
    // modified word, not the individual unigrams. Tesseract
    // has limited support for ngram unichar (e.g. dawg permuter).
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
        !sscanf(token, "%d", type)) {
      if (debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
  }
  return true;
}
Esempio n. 5
0
/**
 * UnicharAmbigs::ParseAmbiguityLine
 *
 * Parses one line of an ambiguity description held in buffer. The line is
 * tokenised in place with strtok_r() on kAmbigDelimiters and is expected to
 * contain: a count, that many test unichars, a second count, that many
 * replacement unichars and, only when version > 0, an integer type.
 *
 * Outputs:
 *  - TestAmbigPartSize: number of unichars in the test part.
 *  - TestUnicharIds: ids of the test unichars followed by
 *    INVALID_UNICHAR_ID; needs room for MAX_AMBIG_SIZE + 1 entries.
 *  - ReplacementAmbigPartSize: number of unichars in the replacement.
 *  - ReplacementString: concatenation of the replacement tokens.
 *  - type: parsed type field (left untouched when version == 0).
 *
 * line_num is used only in diagnostics. Most diagnostics are printed only
 * when global_ambigs_debug_level is non-zero.
 *
 * Returns true if the line parsed completely, false otherwise. Output
 * parameters may have been partially written when false is returned.
 */
bool UnicharAmbigs::ParseAmbiguityLine(
    int line_num, int version, const UNICHARSET &unicharset,
    char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
    int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
  int i;
  char *token;
  char *next_token;
  // The parsed count (not the pointer) must be positive. sscanf() returns
  // EOF (non-zero) on input failure, so compare against the expected
  // conversion count instead of negating the result.
  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
      sscanf(token, "%d", TestAmbigPartSize) != 1 ||
      *TestAmbigPartSize <= 0) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  for (i = 0; i < *TestAmbigPartSize; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    if (!unicharset.contains_unichar(token)) {
      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
    TestUnicharIds[i] = unicharset.unichar_to_id(token);
  }
  TestUnicharIds[i] = INVALID_UNICHAR_ID;

  if (i != *TestAmbigPartSize ||
      !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
      sscanf(token, "%d", ReplacementAmbigPartSize) != 1 ||
      *ReplacementAmbigPartSize <= 0) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  ReplacementString[0] = '\0';
  for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    // NOTE(review): the token is appended before it is validated and the
    // capacity of ReplacementString is not visible here, so an over-long
    // token could overrun the caller's buffer -- confirm against the caller.
    strcat(ReplacementString, token);
    if (!unicharset.contains_unichar(token)) {
      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
  }
  if (i != *ReplacementAmbigPartSize) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (version > 0) {
    // The next field being true indicates that the ambiguity should
    // always be substituted (e.g. '' should always be changed to ").
    // For such "certain" n -> m ambigs tesseract will insert character
    // fragments for the n pieces in the unicharset. AmbigsFound()
    // will then replace the incorrect ngram with the character
    // fragments of the correct character (or ngram if m > 1).
    // Note that if m > 1, an ngram will be inserted into the
    // modified word, not the individual unigrams. Tesseract
    // has limited support for ngram unichar (e.g. dawg permuter).
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
        sscanf(token, "%d", type) != 1) {
      if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
  }
  return true;
}
/** SetUpForFloat2Int **************************************************/
// For every MERGE_CLASS in LabeledClassList, fills the TrainingData entry
// indexed by the unichar id of the class label:
//  - freshly allocated prototypes carrying the source X, Y, Length and Angle,
//    with A, B, C taken from the {X, Y, Angle} triple after Normalize();
//  - freshly allocated configuration bit vectors copied word by word;
//  - the font_set, which is moved out of the source class into the target.
void SetUpForFloat2Int(
    LIST LabeledClassList)
{
  iterate(LabeledClassList)
  {
    UnicityTableEqEq<int> moved_fonts;
    MERGE_CLASS source = (MERGE_CLASS) first_node (LabeledClassList);
    CLASS_TYPE target = &TrainingData[unicharset_training.unichar_to_id(
        source->Label)];
    const int proto_count = source->Class->NumProtos;
    const int config_count = source->Class->NumConfigs;
    // Park the font set in a local; it is handed to the target further down.
    moved_fonts.move(&source->Class->font_set);

    // Prototypes.
    target->NumProtos = proto_count;
    target->MaxNumProtos = proto_count;
    target->Prototypes =
        (PROTO) Emalloc (sizeof(PROTO_STRUCT) * proto_count);
    for (int p = 0; p < proto_count; p++)
    {
      PROTO dst = ProtoIn(target, p);
      PROTO src = ProtoIn(source->Class, p);
      float coeffs[3];
      coeffs[0] = src->X;
      coeffs[1] = src->Y;
      coeffs[2] = src->Angle;
      Normalize(coeffs);
      dst->X = src->X;
      dst->Y = src->Y;
      dst->Length = src->Length;
      dst->Angle = src->Angle;
      dst->A = coeffs[0];
      dst->B = coeffs[1];
      dst->C = coeffs[2];
    }

    // Configurations.
    target->NumConfigs = config_count;
    target->MaxNumConfigs = config_count;
    target->font_set.move(&moved_fonts);
    target->Configurations =
        (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * config_count);
    const int words_per_config = WordsInVectorOfSize(proto_count);
    for (int c = 0; c < config_count; c++)
    {
      BIT_VECTOR copy = NewBitVector(proto_count);
      BIT_VECTOR original = source->Class->Configurations[c];
      for (int w = 0; w < words_per_config; w++)
      {
        copy[w] = original[w];
      }
      target->Configurations[c] = copy;
    }
  }
} // SetUpForFloat2Int