/** * WERD_CHOICE::WERD_CHOICE * * Constructor to build a WERD_CHOICE from the given string. * The function assumes that src_string is not NULL. */ WERD_CHOICE::WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset) : unicharset_(&unicharset){ GenericVector<UNICHAR_ID> encoding; GenericVector<char> lengths; if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) { lengths.push_back('\0'); STRING src_lengths = &lengths[0]; this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM); } else { // There must have been an invalid unichar in the string. this->init(8); this->make_bad(); } }
bool UnicharAmbigs::ParseAmbiguityLine( int line_num, int version, int debug_level, const UNICHARSET &unicharset, char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, int *replacement_ambig_part_size, char *replacement_string, int *type) { if (version > 1) { // Simpler format is just wrong-string correct-string type\n. STRING input(buffer); GenericVector<STRING> fields; input.split(' ', &fields); if (fields.size() != 3) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } // Encode wrong-string. GenericVector<UNICHAR_ID> unichars; if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL, NULL)) { return false; } *test_ambig_part_size = unichars.size(); if (*test_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } // Copy encoded string to output. for (int i = 0; i < unichars.size(); ++i) test_unichar_ids[i] = unichars[i]; test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID; // Encode replacement-string to check validity. if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL, NULL)) { return false; } *replacement_ambig_part_size = unichars.size(); if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } if (sscanf(fields[2].string(), "%d", type) != 1) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string()); return true; } int i; char *token; char *next_token; if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", test_ambig_part_size) || *test_ambig_part_size <= 0) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*test_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } for (i = 0; i < *test_ambig_part_size; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; if (!unicharset.contains_unichar(token)) { if (debug_level) tprintf(kIllegalUnicharMsg, token); break; } test_unichar_ids[i] = unicharset.unichar_to_id(token); } test_unichar_ids[i] = INVALID_UNICHAR_ID; if (i != *test_ambig_part_size || !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", replacement_ambig_part_size) || *replacement_ambig_part_size <= 0) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) { if (debug_level) tprintf("Too many unichars in ambiguity on line %d\n", line_num); return false; } replacement_string[0] = '\0'; for (i = 0; i < *replacement_ambig_part_size; ++i) { if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break; strcat(replacement_string, token); if (!unicharset.contains_unichar(token)) { if (debug_level) tprintf(kIllegalUnicharMsg, token); break; } } if (i != *replacement_ambig_part_size) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } if (version > 0) { // The next field being true indicates that the abiguity should // always be substituted (e.g. '' should always be changed to "). // For such "certain" n -> m ambigs tesseract will insert character // fragments for the n pieces in the unicharset. AmbigsFound() // will then replace the incorrect ngram with the character // fragments of the correct character (or ngram if m > 1). // Note that if m > 1, an ngram will be inserted into the // modified word, not the individual unigrams. Tesseract // has limited support for ngram unichar (e.g. dawg permuter). if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) || !sscanf(token, "%d", type)) { if (debug_level) tprintf(kIllegalMsg, line_num); return false; } } return true; }