void UNICHARSET::unichar_insert(const char* const unichar_repr) { if (!ids.contains(unichar_repr)) { if (strlen(unichar_repr) > UNICHAR_LEN) { fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n", int(strlen(unichar_repr)), unichar_repr); return; } if (size_used == size_reserved) { if (size_used == 0) reserve(8); else reserve(2 * size_used); } strcpy(unichars[size_used].representation, unichar_repr); this->set_script(size_used, null_script); // If the given unichar_repr represents a fragmented character, set // fragment property to a pointer to CHAR_FRAGMENT class instance with // information parsed from the unichar representation. Use the script // of the base unichar for the fragmented character if possible. CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr); this->unichars[size_used].properties.fragment = frag; if (frag != NULL && this->contains_unichar(frag->get_unichar())) { this->unichars[size_used].properties.script_id = this->get_script(frag->get_unichar()); } this->unichars[size_used].properties.enabled = true; ids.insert(unichar_repr, size_used); ++size_used; } }
// Creates a fake blob choice from the combination of the given fragments. // unichar is the class to be made from the combination, // expanded_fragment_lengths[choice_index] is the number of fragments to use. // old_choices[choice_index] has the classifier output for each fragment. // choice index initially indexes the last fragment and should be decremented // expanded_fragment_lengths[choice_index] times to get the earlier fragments. // Guarantees to return something non-null, or abort! BLOB_CHOICE* Wordrec::rebuild_fragments( const char* unichar, const char* expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) { float rating = 0.0f; float certainty = 0.0f; inT16 min_xheight = -MAX_INT16; inT16 max_xheight = MAX_INT16; for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1; fragment_pieces >= 0; --fragment_pieces, --choice_index) { // Get a pointer to the classifier results from the old_choices. BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index); // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. 
CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[choice_index], false); BLOB_CHOICE_IT choice_it(current_choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(choice->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += choice->rating(); if (choice->certainty() < certainty) { certainty = choice->certainty(); } IntersectRange(choice->min_xheight(), choice->max_xheight(), &min_xheight, &max_xheight); break; } } if (choice_it.cycled_list()) { print_ratings_list("Failure", current_choices, unicharset); tprintf("Failed to find fragment %s at index=%d\n", fragment.to_string().string(), choice_index); } ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment. } return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, -1, -1, 0, min_xheight, max_xheight, false); }
void UNICHARSET::unichar_insert(const char* const unichar_repr, OldUncleanUnichars old_style) { if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true; std::string cleaned = old_style_included_ ? unichar_repr : CleanupString(unichar_repr); if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) { const char* str = cleaned.c_str(); GenericVector<int> encoding; if (!old_style_included_ && encode_string(str, true, &encoding, nullptr, nullptr)) return; if (size_used == size_reserved) { if (size_used == 0) reserve(8); else reserve(2 * size_used); } int index = 0; do { if (index >= UNICHAR_LEN) { fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN, unichar_repr); return; } unichars[size_used].representation[index++] = *str++; } while (*str != '\0'); unichars[size_used].representation[index] = '\0'; this->set_script(size_used, null_script); // If the given unichar_repr represents a fragmented character, set // fragment property to a pointer to CHAR_FRAGMENT class instance with // information parsed from the unichar representation. Use the script // of the base unichar for the fragmented character if possible. CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation); this->unichars[size_used].properties.fragment = frag; if (frag != nullptr && this->contains_unichar(frag->get_unichar())) { this->unichars[size_used].properties.script_id = this->get_script(frag->get_unichar()); } this->unichars[size_used].properties.enabled = true; ids.insert(unichars[size_used].representation, size_used); ++size_used; } }
// Parses the textual form of a character fragment:
//
//   <kSeparator> unichar <kSeparator> pos <kSeparator | kNaturalFlag> total
//
// where `unichar` is one or more UTF-8 characters (at most UNICHAR_LEN
// bytes) and `pos` / `total` are decimal integers. The second delimiter may
// be kNaturalFlag instead of kSeparator, which marks the fragment as natural.
//
// Returns a newly allocated CHAR_FRAGMENT owned by the caller, or NULL when
// `string` is not a well-formed fragment representation.
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  const char *const end = string + len;
  ptr++;  // move past the leading separator

  // Measure the unichar: everything up to the next separator, advancing one
  // whole UTF-8 character at a time.
  int step = 0;
  while ((ptr + step) < end && *(ptr + step) != kSeparator) {
    const int char_step = UNICHAR::utf8_step(ptr + step);
    if (char_step <= 0) {
      // A non-positive step would never advance and the loop would spin
      // forever; treat the text as not being a fragment.
      return NULL;
    }
    step += char_step;
    if (step > end - ptr) {
      // The last character claims more bytes than the string holds, so
      // stop before stepping beyond the terminator.
      return NULL;
    }
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return NULL;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator

  // Read the two numeric fields, each introduced by a delimiter.
  int pos = 0;
  int total = 0;
  bool natural = false;
  char *end_ptr = NULL;
  for (int i = 0; i < 2; i++) {
    if (ptr > end || *ptr != kSeparator) {
      // Only the second field may be introduced by the natural flag.
      if (i == 1 && *ptr == kNaturalFlag) {
        natural = true;
      } else {
        return NULL;  // Failed to parse fragment representation.
      }
    }
    ptr++;  // move past the delimiter
    const long value = strtol(ptr, &end_ptr, 10);
    if (end_ptr == ptr) {
      return NULL;  // the field is empty: strtol converted nothing
    }
    if (i == 0) {
      pos = static_cast<int>(value);
    } else {
      total = static_cast<int>(value);
    }
    ptr = end_ptr;
  }
  if (ptr != end) {
    return NULL;  // malformed fragment representation (trailing text)
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total, natural);
  return fragment;
}
// Loads the unicharset from a line-oriented text source.
//
// fgets_cb is called as Run(buffer, size) to fetch each line; a NULL result
// is treated as a read failure. The first line must hold the number of
// entries. Each following line describes one entry and is matched against
// progressively shorter layouts (17, 16, 10, 8, 4, 3 and finally 2 fields);
// fields that a line does not supply keep the defaults set at the top of the
// loop body.
//
// When skip_fragments is true, entries that parse as a character fragment
// with more than one piece are not inserted.
//
// The set is cleared first. Returns false if the size line or any entry line
// cannot be read or parsed (the set may be left partially filled); otherwise
// runs post_load_setup() and returns true.
bool UNICHARSET::load_via_fgets(
    TessResultCallback2<char *, char *, int> *fgets_cb,
    bool skip_fragments) {
  int unicharset_size;
  char buffer[256];

  this->clear();
  // Header line: the number of entries that follow.
  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
      sscanf(buffer, "%d", &unicharset_size) != 1) {
    return false;
  }
  this->reserve(unicharset_size);
  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
    // unichar is as large as buffer, so the unbounded %s conversions below
    // cannot write more than the line that was read.
    char unichar[256];
    unsigned int properties;
    char script[64];

    // Defaults for every optional field.
    strcpy(script, null_script);
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    int min_width = 0;
    int max_width = MAX_INT16;
    int min_bearing = 0;
    int max_bearing = MAX_INT16;
    int min_advance = 0;
    int max_advance = MAX_INT16;
    // TODO(eger): check that this default is ok
    // after enabling BiDi iterator for Arabic+Cube.
    int direction = UNICHARSET::U_LEFT_TO_RIGHT;
    UNICHAR_ID other_case = id;
    UNICHAR_ID mirror = id;
    char normed[64];
    // v ends up holding the field count of the layout that matched; it
    // decides below which optional values are trusted.
    int v = -1;
    // Try the richest layout first and fall back to shorter ones. The &&
    // chain stops at the first sscanf that converts exactly the expected
    // number of fields. A failed attempt may already have overwritten some
    // outputs; each later attempt re-parses the line from the start.
    if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
        ((v = sscanf(buffer,
                     "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &min_width, &max_width, &min_bearing, &max_bearing,
                     &min_advance, &max_advance,
                     script, &other_case, &direction, &mirror, normed)) != 17 &&
         (v = sscanf(buffer,
                     "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     &min_width, &max_width, &min_bearing, &max_bearing,
                     &min_advance, &max_advance,
                     script, &other_case, &direction, &mirror)) != 16 &&
         (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
                     unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     script, &other_case, &direction, &mirror)) != 10 &&
         (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
                     &min_bottom, &max_bottom, &min_top, &max_top,
                     script, &other_case)) != 8 &&
         (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
                     script, &other_case)) != 4 &&
         (v = sscanf(buffer, "%s %x %63s",
                     unichar, &properties, script)) != 3 &&
         (v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
      return false;
    }

    // Skip fragments if needed.
    // NOTE(review): `continue` still advances id although nothing was
    // inserted, so later entries would be configured under an id one past
    // their insertion slot -- presumably fragments are expected only at the
    // end of the list; confirm.
    CHAR_FRAGMENT *frag = NULL;
    if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
      int num_pieces = frag->get_total();
      delete frag;
      // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
      if (num_pieces > 1)
        continue;
    }
    // Insert unichar into unicharset and set its properties.
    // The literal text "NULL" is stored as a single space.
    if (strcmp(unichar, "NULL") == 0)
      this->unichar_insert(" ");
    else
      this->unichar_insert(unichar);

    // properties is a bit mask; each flag is extracted with its own mask.
    this->set_isalpha(id, properties & ISALPHA_MASK);
    this->set_islower(id, properties & ISLOWER_MASK);
    this->set_isupper(id, properties & ISUPPER_MASK);
    this->set_isdigit(id, properties & ISDIGIT_MASK);
    this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
    this->set_isngram(id, false);
    this->set_script(id, script);
    this->unichars[id].properties.enabled = true;
    this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
    this->set_width_range(id, min_width, max_width);
    this->set_bearing_range(id, min_bearing, max_bearing);
    this->set_advance_range(id, min_advance, max_advance);
    this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
    // other_case, mirror and normed are only used when the matched layout
    // actually contained them; otherwise the entry refers to itself.
    ASSERT_HOST(other_case < unicharset_size);
    this->set_other_case(id, (v>3) ? other_case : id);
    ASSERT_HOST(mirror < unicharset_size);
    this->set_mirror(id, (v>8) ? mirror : id);
    this->set_normed(id, (v>16) ? normed : unichar);
  }
  post_load_setup();
  return true;
}
/**
 * rebuild_current_state
 *
 * Rebuilds the per-character choice lists for the segmentation described by
 * `state`, merging the blobs of fragmented characters back into whole
 * characters.
 *
 * The chunks of the segmentation are walked from the last one to the first.
 * Each chunk is (re)classified with join_blobs_and_classify(). Where
 * best_choice says a character was recognised as several fragments, the
 * matching fragment choices are accumulated (ratings summed, highest
 * certainty kept), the fragment blobs are classified again as one merged
 * blob, and a choice for the composed character is inserted into that list
 * in rating order. Fragment choices are removed from every list returned.
 *
 * @param blobs          blobs of the word
 * @param seam_list      seams between the blobs
 * @param state          segmentation state to rebuild
 * @param old_choices    previous choices; returned as-is on the early exit,
 *                       otherwise its contents and the vector are deleted
 * @param fx             passed through to join_blobs_and_classify()
 * @param force_rebuild  rebuild even when there are no fragments to merge
 * @param best_choice    supplies fragment lengths and unichar strings
 * @param ratings        passed through to join_blobs_and_classify()
 * @return old_choices when force_rebuild is false and best_choice contains
 *         no fragments; otherwise a newly allocated vector with one choice
 *         list per character.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuiling state", state, num_joints);
  }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // Advance x to the first blob of the last chunk; y is the last blob.
  int x = 0;
  int y;
  int i;
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    // One (initially empty) slot per character; each character contributes
    // its fragment count once per fragment to the expanded string.
    for (int ch = 0; ch < best_choice.length(); ++ch) {
      *char_choices += NULL;
      if (fragment_lengths[ch] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[ch]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[ch];
      }
    }
  } else {
    // No fragment information: every chunk is a whole character.
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += static_cast<char>(1);
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments to
  // merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_begin = NULL;
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_begin = best_choice.unichar_lengths().string();
    word_lengths_ptr = word_lengths_begin + (strlen(word_lengths_begin) - 1);
    // Make word_ptr point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr) - *word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
      expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  // NOTE: a choice composed from original fragment choices will always be
  // added to the new choices list for each character composed from
  // fragments (even if the choice for the corresponding character appears
  // in the re-classified choices list for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        // NOTE(review): assumes *word_lengths_ptr <= UNICHAR_LEN -- confirm.
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
            getDict().getUnicharset().get_fragment(
                temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x, true_y,
                                                  fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
            new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                            rating, certainty, 0, NO_PERM);

        // Insert merged_choice into current_choices, such that
        // current_choices are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward()) {
        }
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
                temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Step word_ptr and word_lengths_ptr back to the previous unichar.
      // Once the first unichar has been consumed there is nothing before
      // it, so do not step (and read) in front of the lengths string.
      if (word_lengths_ptr != NULL && word_ptr != NULL &&
          word_lengths_ptr > word_lengths_begin) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    // Move the [x, y] window to the previous chunk.
    y = x - 1;
    x = y - search_state[i];
  }
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return (char_choices);
}