// Creates a fake blob choice from the combination of the given fragments. // unichar is the class to be made from the combination, // expanded_fragment_lengths[choice_index] is the number of fragments to use. // old_choices[choice_index] has the classifier output for each fragment. // choice index initially indexes the last fragment and should be decremented // expanded_fragment_lengths[choice_index] times to get the earlier fragments. // Guarantees to return something non-null, or abort! BLOB_CHOICE* Wordrec::rebuild_fragments( const char* unichar, const char* expanded_fragment_lengths, int choice_index, BLOB_CHOICE_LIST_VECTOR *old_choices) { float rating = 0.0f; float certainty = 0.0f; inT16 min_xheight = -MAX_INT16; inT16 max_xheight = MAX_INT16; for (int fragment_pieces = expanded_fragment_lengths[choice_index] - 1; fragment_pieces >= 0; --fragment_pieces, --choice_index) { // Get a pointer to the classifier results from the old_choices. BLOB_CHOICE_LIST *current_choices = old_choices->get(choice_index); // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[choice_index], false); BLOB_CHOICE_IT choice_it(current_choices); for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) { BLOB_CHOICE* choice = choice_it.data(); const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(choice->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += choice->rating(); if (choice->certainty() < certainty) { certainty = choice->certainty(); } IntersectRange(choice->min_xheight(), choice->max_xheight(), &min_xheight, &max_xheight); break; } } if (choice_it.cycled_list()) { print_ratings_list("Failure", current_choices, unicharset); tprintf("Failed to find fragment %s at index=%d\n", fragment.to_string().string(), choice_index); } ASSERT_HOST(!choice_it.cycled_list()); // Be sure we found the fragment. } return new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, -1, -1, 0, min_xheight, max_xheight, false); }
void LMConsistencyInfo::ComputeXheightConsistency( const BLOB_CHOICE *b, bool is_punc) { if (xht_decision == XH_INCONSISTENT) return; // It isn't going to get any better. // Compute xheight consistency. bool parent_null = xht_sp < 0; int parent_sp = xht_sp; // Debug strings. if (b->yshift() > LMConsistencyInfo::kShiftThresh) { xht_sp = LMConsistencyInfo::kSUP; } else if (b->yshift() < -LMConsistencyInfo::kShiftThresh) { xht_sp = LMConsistencyInfo::kSUB; } else { xht_sp = LMConsistencyInfo::kNORM; } xht_count[xht_sp]++; if (is_punc) xht_count_punc[xht_sp]++; if (!parent_null) { xpos_entropy += abs(parent_sp - xht_sp); } // TODO(eger): Figure out a better way to account for small caps. // For the first character not y-shifted, we only care if it is too small. // Too large is common in drop caps and small caps. // inT16 small_xht = b->min_xheight(); // if (parent_vse == NULL && sp == LanguageModelConsistencyInfo::kNORM) { // small_xht = 0; // } IntersectRange(b->min_xheight(), b->max_xheight(), &(xht_lo[xht_sp]), &(xht_hi[xht_sp])); // Compute xheight inconsistency kinds. if (parent_null) { if (xht_count[kNORM] == 1) { xht_decision = XH_GOOD; } else { xht_decision = XH_SUBNORMAL; } return; } // When we intersect the ranges of xheights in pixels for all characters in // each position (subscript, normal, superscript), // How much range must be left? 0? [exactly one pixel height for xheight] 1? // TODO(eger): Extend this code to take a prior for the rest of the line. const int kMinIntersectedXHeightRange = 0; for (int i = 0; i < kNumPos; i++) { if (xht_lo[i] > xht_hi[i] - kMinIntersectedXHeightRange) { xht_decision = XH_INCONSISTENT; return; } } // Reject as improbable anything where there's much punctuation in subscript // or superscript regions. if (xht_count_punc[kSUB] > xht_count[kSUB] * 0.4 || xht_count_punc[kSUP] > xht_count[kSUP] * 0.4) { xht_decision = XH_INCONSISTENT; return; } // Now check that the subscript and superscript aren't too small relative to // the mainline. double mainline_xht = static_cast<double>(xht_lo[kNORM]); double kMinSizeRatio = 0.4; if (mainline_xht > 0.0 && (static_cast<double>(xht_hi[kSUB]) / mainline_xht < kMinSizeRatio || static_cast<double>(xht_hi[kSUP]) / mainline_xht < kMinSizeRatio)) { xht_decision = XH_INCONSISTENT; return; } // TODO(eger): Check into inconsistency of super/subscript y offsets. if (xpos_entropy > kMaxEntropy) { xht_decision = XH_INCONSISTENT; return; } if (xht_count[kSUB] == 0 && xht_count[kSUP] == 0) { xht_decision = XH_GOOD; return; } xht_decision = XH_SUBNORMAL; }
/********************************************************************** * merge_and_put_fragment_lists * * Merge the fragment lists in choice_lists and append it to the * ratings matrix. **********************************************************************/ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) { BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts]; for (int i = 0; i < num_frag_parts; i++) { choice_lists_it[i].set_to_list(&choice_lists[i]); choice_lists_it[i].mark_cycle_pt(); } BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column); if (merged_choice == NULL) merged_choice = new BLOB_CHOICE_LIST; bool end_of_list = false; BLOB_CHOICE_IT merged_choice_it(merged_choice); while (!end_of_list) { // Find the maximum unichar_id of the current entry the iterators // are pointing at UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id(); for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (max_unichar_id < unichar_id) { max_unichar_id = unichar_id; } } // Move the each iterators until it gets to an entry that has a // value greater than or equal to max_unichar_id for (int i = 0; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); while (!choice_lists_it[i].cycled_list() && unichar_id < max_unichar_id) { choice_lists_it[i].forward(); unichar_id = choice_lists_it[i].data()->unichar_id(); } if (choice_lists_it[i].cycled_list()) { end_of_list = true; break; } } if (end_of_list) break; // Checks if the fragments are parts of the same character UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id(); bool same_unichar = true; for (int i = 1; i < num_frag_parts; i++) { UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id(); if (unichar_id != first_unichar_id) { same_unichar = false; break; } } if (same_unichar) { // Add the merged character to the result UNICHAR_ID merged_unichar_id = first_unichar_id; inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); float merged_min_xheight = choice_lists_it[0].data()->min_xheight(); float merged_max_xheight = choice_lists_it[0].data()->max_xheight(); float positive_yshift = 0, negative_yshift = 0; int merged_script_id = choice_lists_it[0].data()->script_id(); BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier(); float merged_rating = 0, merged_certainty = 0; for (int i = 0; i < num_frag_parts; i++) { float rating = choice_lists_it[i].data()->rating(); float certainty = choice_lists_it[i].data()->certainty(); if (i == 0 || certainty < merged_certainty) merged_certainty = certainty; merged_rating += rating; choice_lists_it[i].forward(); if (choice_lists_it[i].cycled_list()) end_of_list = true; IntersectRange(choice_lists_it[i].data()->min_xheight(), choice_lists_it[i].data()->max_xheight(), &merged_min_xheight, &merged_max_xheight); float yshift = choice_lists_it[i].data()->yshift(); if (yshift > positive_yshift) positive_yshift = yshift; if (yshift < negative_yshift) negative_yshift = yshift; } float merged_yshift = positive_yshift != 0 ? (negative_yshift != 0 ? 0 : positive_yshift) : negative_yshift; merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, merged_rating, merged_certainty, merged_fontinfo_id, merged_fontinfo_id2, merged_script_id, merged_min_xheight, merged_max_xheight, merged_yshift, classifier)); } } if (classify_debug_level) print_ratings_list("Merged Fragments", merged_choice, unicharset); if (merged_choice->empty()) delete merged_choice; else ratings->put(row, column, merged_choice); delete [] choice_lists_it; }