UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) { UNICHAR uch(wc); char *unichar = uch.utf8_str(); UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar); delete[] unichar; return unichar_id; }
/** * WERD_CHOICE::init * * Helper function to build a WERD_CHOICE from the given string, * fragment lengths, rating, certainty and permuter. * * The function assumes that src_string is not NULL. * src_lengths argument could be NULL, in which case the unichars * in src_string are assumed to all be of length 1. */ void WERD_CHOICE::init(const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uinT8 src_permuter, const UNICHARSET &unicharset) { int src_string_len = strlen(src_string); if (src_string_len == 0) { this->init(8); } else { this->init(src_lengths ? strlen(src_lengths): src_string_len); length_ = reserved_; int offset = 0; for (int i = 0; i < length_; ++i) { int unichar_length = src_lengths ? src_lengths[i] : 1; unichar_ids_[i] = unicharset.unichar_to_id(src_string+offset, unichar_length); fragment_lengths_[i] = 1; offset += unichar_length; } } rating_ = src_rating; certainty_ = src_certainty; permuter_ = src_permuter; }
int Dawg::check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const { if (filename == nullptr) return 0; FILE *word_file; char string [CHARS_PER_LINE]; int misses = 0; UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard); word_file = fopen(filename, "r"); if (word_file == nullptr) { tprintf("Error: Could not open file %s\n", filename); ASSERT_HOST(word_file); } while (fgets (string, CHARS_PER_LINE, word_file) != nullptr) { chomp_string(string); // remove newline WERD_CHOICE word(string, unicharset); if (word.length() > 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) { if (!match_words(&word, 0, 0, enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) { tprintf("Missing word: %s\n", string); ++misses; } } else { tprintf("Failed to create a valid word from %s\n", string); } } fclose (word_file); // Make sure the user sees this with fprintf instead of tprintf. if (debug_level_) tprintf("Number of lost words=%d\n", misses); return misses; }
bool UnicharAmbigs::ParseAmbiguityLine( int line_num, int version, int debug_level, const UNICHARSET &unicharset, char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, int *replacement_ambig_part_size, char *replacement_string, int *type) { if (version > 1) { // Simpler format is just wrong-string correct-string type\n. STRING input(buffer); GenericVector<STRING> fields; input.split(' ', &fields); if (fields.size() != 3) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } // Encode wrong-string. GenericVector<UNICHAR_ID> unichars; if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL, NULL)) { return false; } *test_ambig_part_size = unichars.size(); if (*test_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } // Copy encoded string to output. for (int i = 0; i < unichars.size(); ++i) test_unichar_ids[i] = unichars[i]; test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; // Encode replacement-string to check validity. if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL, NULL)) { return false; } *replacement_ambig_part_size = unichars.size(); if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } if (sscanf(fields[2].string(), "%d", type) != 1) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string()); return true; } int i; char *token; char *next_token; if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", test_ambig_part_size) || *test_ambig_part_size <= 0) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*test_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } for (i = 0; i < *test_ambig_part_size; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; if (!unicharset.contains_unichar(token)) { if (debug_level) tprintf(kIllegalUnicharMsg, token); break; } test_unichar_ids[i] = unicharset.unichar_to_id(token); } test_unichar_ids[i] = INVALID_UNICHAR_ID; if (i != *test_ambig_part_size || !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", replacement_ambig_part_size) || *replacement_ambig_part_size <= 0) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } replacement_string[0] = '\0'; for (i = 0; i < *replacement_ambig_part_size; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; strcat(replacement_string, token); if (!unicharset.contains_unichar(token)) { if (debug_level) tprintf(kIllegalUnicharMsg, token); break; } } if (i != *replacement_ambig_part_size) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (version > 0) { // The next field being true indicates that the abiguity should // always be substituted (e.g. '' should always be changed to "). // For such "certain" n -> m ambigs tesseract will insert character // fragments for the n pieces in the unicharset. AmbigsFound() // will then replace the incorrect ngram with the character // fragments of the correct character (or ngram if m > 1). // Note that if m > 1, an ngram will be inserted into the // modified word, not the individual unigrams. Tesseract // has limited support for ngram unichar (e.g. dawg permuter). if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", type)) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } } return true; }
bool UnicharAmbigs::ParseAmbiguityLine( int line_num, int version, const UNICHARSET &unicharset, char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds, int *ReplacementAmbigPartSize, char *ReplacementString, int *type) { int i; char *token; char *next_token; if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) { if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*TestAmbigPartSize > MAX_AMBIG_SIZE) { tprintf("Too many unichars in ambiguity on line %d\n"); return false; } for (i = 0; i < *TestAmbigPartSize; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; if (!unicharset.contains_unichar(token)) { if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token); break; } TestUnicharIds[i] = unicharset.unichar_to_id(token); } TestUnicharIds[i] = INVALID_UNICHAR_ID; if (i != *TestAmbigPartSize || !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", ReplacementAmbigPartSize) || *ReplacementAmbigPartSize <= 0) { if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) { tprintf("Too many unichars in ambiguity on line %d\n"); return false; } ReplacementString[0] = '\0'; for (i = 0; i < *ReplacementAmbigPartSize; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; strcat(ReplacementString, token); if (!unicharset.contains_unichar(token)) { if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token); break; } } if (i != *ReplacementAmbigPartSize) { if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num); return false; } if (version > 0) { // The next field being true indicates that the abiguity should // always be substituted (e.g. '' should always be changed to "). // For such "certain" n -> m ambigs tesseract will insert character // fragments for the n pieces in the unicharset. AmbigsFound() // will then replace the incorrect ngram with the character // fragments of the correct character (or ngram if m > 1). // Note that if m > 1, an ngram will be inserted into the // modified word, not the individual unigrams. Tesseract // has limited support for ngram unichar (e.g. dawg permuter). if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", type)) { if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num); return false; } } return true; }
/** SetUpForFloat2Int **************************************************/ void SetUpForFloat2Int( LIST LabeledClassList) { MERGE_CLASS MergeClass; CLASS_TYPE Class; int NumProtos; int NumConfigs; int NumWords; int i, j; float Values[3]; PROTO NewProto; PROTO OldProto; BIT_VECTOR NewConfig; BIT_VECTOR OldConfig; // printf("Float2Int ...\n"); iterate(LabeledClassList) { UnicityTableEqEq<int> font_set; MergeClass = (MERGE_CLASS) first_node (LabeledClassList); Class = &TrainingData[unicharset_training.unichar_to_id( MergeClass->Label)]; NumProtos = MergeClass->Class->NumProtos; NumConfigs = MergeClass->Class->NumConfigs; font_set.move(&MergeClass->Class->font_set); Class->NumProtos = NumProtos; Class->MaxNumProtos = NumProtos; Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos); for(i=0; i < NumProtos; i++) { NewProto = ProtoIn(Class, i); OldProto = ProtoIn(MergeClass->Class, i); Values[0] = OldProto->X; Values[1] = OldProto->Y; Values[2] = OldProto->Angle; Normalize(Values); NewProto->X = OldProto->X; NewProto->Y = OldProto->Y; NewProto->Length = OldProto->Length; NewProto->Angle = OldProto->Angle; NewProto->A = Values[0]; NewProto->B = Values[1]; NewProto->C = Values[2]; } Class->NumConfigs = NumConfigs; Class->MaxNumConfigs = NumConfigs; Class->font_set.move(&font_set); Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs); NumWords = WordsInVectorOfSize(NumProtos); for(i=0; i < NumConfigs; i++) { NewConfig = NewBitVector(NumProtos); OldConfig = MergeClass->Class->Configurations[i]; for(j=0; j < NumWords; j++) NewConfig[j] = OldConfig[j]; Class->Configurations[i] = NewConfig; } } } // SetUpForFloat2Int