// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) const { tprintf("Ratings Matrix (top 3 choices)\n"); int dim = dimension(); int band_width = bandwidth(); int row, col; for (col = 0; col < dim; ++col) { for (row = col; row < dim && row < col + band_width; ++row) { BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating == NOT_CLASSIFIED) continue; BLOB_CHOICE_IT b_it(rating); tprintf("col=%d row=%d ", col, row); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s rat=%g cert=%g " , unicharset.id_to_unichar(b_it.data()->unichar_id()), b_it.data()->rating(), b_it.data()->certainty()); } tprintf("\n"); } tprintf("\n"); } tprintf("\n"); for (col = 0; col < dim; ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < dim; ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); if (row >= col + band_width) { tprintf(" \t"); continue; } BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
// Print the best guesses out of the match rating matrix. void MATRIX::print(const UNICHARSET &unicharset) { tprintf("Ratings Matrix (top choices)\n"); int row, col; for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col); tprintf("\n"); for (row = 0; row < this->dimension(); ++row) { for (col = 0; col <= row; ++col) { if (col == 0) tprintf("%d\t", row); BLOB_CHOICE_LIST *rating = this->get(col, row); if (rating != NOT_CLASSIFIED) { BLOB_CHOICE_IT b_it(rating); int counter = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id())); ++counter; if (counter == 3) break; } tprintf("\t"); } else { tprintf(" \t"); } } tprintf("\n"); } }
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: /// All fuzzy spaces are removed, and all the words are maximally chopped. PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes, BLOCK_LIST *block_list) { PreenXHeights(block_list); // Strip all fuzzy space markers to simplify the PAGE_RES. BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (word->cblob_list()->empty()) { delete w_it.extract(); } else { word->set_flag(W_FUZZY_SP, false); word->set_flag(W_FUZZY_NON, false); } } } } PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL); PAGE_RES_IT pr_it(page_res); WERD_RES* word_res; while ((word_res = pr_it.word()) != NULL) { MaximallyChopWord(boxes, pr_it.block()->block, pr_it.row()->row, word_res); pr_it.forward(); } return page_res; }
// Returns true if there are any real classification results. bool MATRIX::Classified(int col, int row, int wildcard_id) const { if (get(col, row) == NOT_CLASSIFIED) return false; BLOB_CHOICE_IT b_it(get(col, row)); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOB_CHOICE* choice = b_it.data(); if (choice->IsClassified()) return true; } return false; }
// Generates training data for training a line recognizer, eg LSTM. // Breaks the boxes into lines, normalizes them, converts to ImageData and // appends them to the given training_data. void Tesseract::TrainFromBoxes(const GenericVector<TBOX>& boxes, const GenericVector<STRING>& texts, BLOCK_LIST *block_list, DocumentData* training_data) { int box_count = boxes.size(); // Process all the text lines in this page, as defined by the boxes. int end_box = 0; // Don't let \t, which marks newlines in the box file, get into the line // content, as that makes the line unusable in training. while (end_box < texts.size() && texts[end_box] == "\t") ++end_box; for (int start_box = end_box; start_box < box_count; start_box = end_box) { // Find the textline of boxes starting at start and their bounding box. TBOX line_box = boxes[start_box]; STRING line_str = texts[start_box]; for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) { line_box += boxes[end_box]; line_str += texts[end_box]; } // Find the most overlapping block. BLOCK* best_block = NULL; int best_overlap = 0; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (block->poly_block() != NULL && !block->poly_block()->IsText()) continue; // Not a text block. TBOX block_box = block->bounding_box(); block_box.rotate(block->re_rotation()); if (block_box.major_overlap(line_box)) { TBOX overlap_box = line_box.intersection(block_box); if (overlap_box.area() > best_overlap) { best_overlap = overlap_box.area(); best_block = block; } } } ImageData* imagedata = NULL; if (best_block == NULL) { tprintf("No block overlapping textline: %s\n", line_str.string()); } else { imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block); } if (imagedata != NULL) training_data->AddPageToDocument(imagedata); // Don't let \t, which marks newlines in the box file, get into the line // content, as that makes the line unusable in training. while (end_box < texts.size() && texts[end_box] == "\t") ++end_box; } }
int compare_filenames(const std::string& a, const std::string& b) { utf8_const_iterator a_begin(a.begin()), a_end(a.end()); utf8_const_iterator b_begin(b.begin()), b_end(b.end()); utf8_const_iterator a_it(a_begin); utf8_const_iterator b_it(b_begin); for (; a_it != a_end && b_it != b_end; ) { int a_chr = *a_it; int b_chr = *b_it; if ((a_chr >= '0') && (a_chr <= '9') && (b_chr >= '0') && (b_chr <= '9')) { utf8_const_iterator a_it2 = a_it; utf8_const_iterator b_it2 = b_it; while (a_it2 != a_end && (*a_it2 >= '0') && (*a_it2 <= '9')) ++a_it2; while (b_it2 != b_end && (*b_it2 >= '0') && (*b_it2 <= '9')) ++b_it2; int a_num = std::strtol(std::string(a_it, a_it2).c_str(), NULL, 10); int b_num = std::strtol(std::string(b_it, b_it2).c_str(), NULL, 10); if (a_num != b_num) return a_num - b_num < 0 ? -1: 1; a_it = a_it2; b_it = b_it2; } else if (is_path_separator(a_chr) && is_path_separator(b_chr)) { ++a_it; ++b_it; } else { a_chr = std::tolower(a_chr); b_chr = std::tolower(b_chr); if (a_chr != b_chr) return a_chr - b_chr < 0 ? -1: 1; ++a_it; ++b_it; } } if (a_it == a_end && b_it == b_end) return 0; else if (a_it == a_end) return -1; else return 1; }
// Factory to build a TWERD from a (C_BLOB) WERD, with polygonal // approximation along the way. TWERD* TWERD::PolygonalCopy(WERD* src) { TWERD* tessword = new TWERD; tessword->latin_script = src->flag(W_SCRIPT_IS_LATIN); C_BLOB_IT b_it(src->cblob_list()); TBLOB *tail = NULL; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { C_BLOB* blob = b_it.data(); TBLOB* tblob = TBLOB::PolygonalCopy(blob); if (tail == NULL) { tessword->blobs = tblob; } else { tail->next = tblob; } tail = tblob; } return tessword; }
/// Any row xheight that is significantly different from the median is set /// to the median. void Tesseract::PreenXHeights(BLOCK_LIST *block_list) { double median_xheight = MedianXHeight(block_list); double max_deviation = kMaxXHeightDeviationFraction * median_xheight; // Strip all fuzzy space markers to simplify the PAGE_RES. BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) { ROW* row = r_it.data(); float diff = fabs(row->x_height() - median_xheight); if (diff > max_deviation) { if (applybox_debug) { tprintf("row xheight=%g, but median xheight = %g\n", row->x_height(), median_xheight); } row->set_x_height(static_cast<float>(median_xheight)); } } } }
// Places a copy of blobs that are near a word (after applying rotation to the // blob) in the most appropriate word, unless there is doubt, in which case a // blob can end up in two words. Source blobs are not touched. void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, const FCOORD& rotation, WordGrid* word_grid) { WordSearch ws(word_grid); BLOBNBOX_IT b_it(diacritic_blobs); // Apply rotation to each blob before finding the nearest words. The rotation // allows us to only consider above/below placement and not left/right on // vertical text, because all text is horizontal here. for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOBNBOX* blobnbox = b_it.data(); TBOX blob_box = blobnbox->bounding_box(); blob_box.rotate(rotation); ws.StartRectSearch(blob_box); // Above/below refer to word position relative to diacritic. Since some // scripts eg Kannada/Telugu habitually put diacritics below words, and // others eg Thai/Vietnamese/Latin put most diacritics above words, try // for both if there isn't much in it. WordWithBox* best_above_word = nullptr; WordWithBox* best_below_word = nullptr; int best_above_distance = 0; int best_below_distance = 0; for (WordWithBox* word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) { if (word->word()->flag(W_REP_CHAR)) continue; TBOX word_box = word->true_bounding_box(); int x_distance = blob_box.x_gap(word_box); int y_distance = blob_box.y_gap(word_box); if (x_distance > 0) { // Arbitrarily divide x-distance by 2 if there is a major y overlap, // and the word is to the left of the diacritic. If the // diacritic is a dropped broken character between two words, this will // help send all the pieces to a single word, instead of splitting them // over the 2 words. if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) { x_distance /= 2; } y_distance += x_distance; } if (word_box.y_middle() > blob_box.y_middle() && (best_above_word == nullptr || y_distance < best_above_distance)) { best_above_word = word; best_above_distance = y_distance; } if (word_box.y_middle() <= blob_box.y_middle() && (best_below_word == nullptr || y_distance < best_below_distance)) { best_below_word = word; best_below_distance = y_distance; } } bool above_good = best_above_word != nullptr && (best_below_word == nullptr || best_above_distance < best_below_distance + blob_box.height()); bool below_good = best_below_word != nullptr && best_below_word != best_above_word && (best_above_word == nullptr || best_below_distance < best_above_distance + blob_box.height()); if (below_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_below_word->RejBlobs()); blob_it.add_to_end(copied_blob); } if (above_good) { C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); copied_blob->rotate(rotation); // Put the blob into the word's reject blobs list. C_BLOB_IT blob_it(best_above_word->RejBlobs()); blob_it.add_to_end(copied_blob); } } }
// Make the textlines and words inside each block. void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) { page_tr_.set_x(width); page_tr_.set_y(height); if (to_blocks->empty()) { // AutoPageSeg was not used, so we need to find_components first. find_components(binary_pix, blocks, to_blocks); TO_BLOCK_IT it(to_blocks); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { TO_BLOCK *to_block = it.data(); // Compute the edge offsets whether or not there is a grey_pix. // We have by-passed auto page seg, so we have to run it here. // By page segmentation mode there is no non-text to avoid running on. to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix); } } else if (!PSM_SPARSE(pageseg_mode)) { // AutoPageSeg does not need to find_components as it did that already. // Filter_blobs sets up the TO_BLOCKs the same as find_components does. filter_blobs(page_tr_, to_blocks, true); } ASSERT_HOST(!to_blocks->empty()); if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) { const FCOORD anticlockwise90(0.0f, 1.0f); const FCOORD clockwise90(0.0f, -1.0f); TO_BLOCK_IT it(to_blocks); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { TO_BLOCK *to_block = it.data(); BLOCK *block = to_block->block; // Create a fake poly_block in block from its bounding box. block->set_poly_block(new POLY_BLOCK(block->bounding_box(), PT_VERTICAL_TEXT)); // Rotate the to_block along with its contained block and blobnbox lists. to_block->rotate(anticlockwise90); // Set the block's rotation values to obey the convention followed in // layout analysis for vertical text. block->set_re_rotation(clockwise90); block->set_classify_rotation(clockwise90); } } TO_BLOCK_IT to_block_it(to_blocks); TO_BLOCK *to_block = to_block_it.data(); // Make the rows in the block. float gradient = 0; // Do it the old fashioned way. if (PSM_LINE_FIND_ENABLED(pageseg_mode)) { gradient = make_rows(page_tr_, to_blocks); } else if (!PSM_SPARSE(pageseg_mode)) { // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row. gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE, to_block, to_blocks); } BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks); baseline_detector.ComputeStraightBaselines(use_box_bottoms); baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true, textord_heavy_nr, textord_show_final_rows, this); // Now make the words in the lines. if (PSM_WORD_FIND_ENABLED(pageseg_mode)) { // SINGLE_LINE uses the old word maker on the single line. make_words(this, page_tr_, gradient, blocks, to_blocks); } else { // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a // single word, and in SINGLE_CHAR mode, all the outlines // go in a single blob. TO_BLOCK *to_block = to_block_it.data(); make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); // Remove empties. // Compute the margins for each row in the block, to be used later for // paragraph detection. BLOCK_IT b_it(blocks); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { b_it.data()->compute_row_margins(); } #ifndef GRAPHICS_DISABLED close_to_win(); #endif }
/// Consume all source blobs that strongly overlap the given box, /// putting them into a new word, with the correct_text label. /// Fights over which box owns which blobs are settled by /// applying the blobs to box or next_box with the least non-overlap. /// @return false if the box was in error, which can only be caused by /// failing to find an overlapping blob for a box. bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list, const TBOX& box, const TBOX& next_box, const char* correct_text) { if (applybox_debug > 1) { tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text); } WERD* new_word = NULL; BLOCK_IT b_it(block_list); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOCK* block = b_it.data(); if (!box.major_overlap(block->bounding_box())) continue; ROW_IT r_it(block->row_list()); for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { ROW* row = r_it.data(); if (!box.major_overlap(row->bounding_box())) continue; WERD_IT w_it(row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (applybox_debug > 2) { tprintf("Checking word:"); word->bounding_box().print(); } if (word->text() != NULL && word->text()[0] != '\0') continue; // Ignore words that are already done. if (!box.major_overlap(word->bounding_box())) continue; C_BLOB_IT blob_it(word->cblob_list()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { C_BLOB* blob = blob_it.data(); TBOX blob_box = blob->bounding_box(); if (!blob_box.major_overlap(box)) continue; double current_box_miss_metric = BoxMissMetric(blob_box, box); double next_box_miss_metric = BoxMissMetric(blob_box, next_box); if (applybox_debug > 2) { tprintf("Checking blob:"); blob_box.print(); tprintf("Current miss metric = %g, next = %g\n", current_box_miss_metric, next_box_miss_metric); } if (current_box_miss_metric > next_box_miss_metric) continue; // Blob is a better match for next box. if (applybox_debug > 2) { tprintf("Blob match: blob:"); blob_box.print(); tprintf("Matches box:"); box.print(); tprintf("With next box:"); next_box.print(); } if (new_word == NULL) { // Make a new word with a single blob. new_word = word->shallow_copy(); new_word->set_text(correct_text); w_it.add_to_end(new_word); } C_BLOB_IT new_blob_it(new_word->cblob_list()); new_blob_it.add_to_end(blob_it.extract()); } } } } if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n"); return new_word != NULL; }
void LMPainPoints::GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res) { ViterbiStateEntry *curr_vse = vse; BLOB_CHOICE *curr_b = vse->curr_b; // The following pain point generation and priority calculation approaches // prioritize exploring paths with low average rating of the known part of // the path, while not relying on the ratings of the pieces to be combined. // // A pain point to combine the neighbors is generated for each pair of // neighboring blobs on the path (the path is represented by vse argument // given to GenerateFromPath()). The priority of each pain point is set to // the average rating (per outline length) of the path, not including the // ratings of the blobs to be combined. // The ratings of the blobs to be combined are not used to calculate the // priority, since it is not possible to determine from their magnitude // whether it will be beneficial to combine the blobs. The reason is that // chopped junk blobs (/ | - ') can have very good (low) ratings, however // combining them will be beneficial. Blobs with high ratings might be // over-joined pieces of characters, but also could be blobs from an unseen // font or chopped pieces of complex characters. while (curr_vse->parent_vse != NULL) { ViterbiStateEntry* parent_vse = curr_vse->parent_vse; const MATRIX_COORD& curr_cell = curr_b->matrix_cell(); const MATRIX_COORD& parent_cell = parent_vse->curr_b->matrix_cell(); MATRIX_COORD pain_coord(parent_cell.col, curr_cell.row); if (!pain_coord.Valid(*word_res->ratings) || !word_res->ratings->Classified(parent_cell.col, curr_cell.row, dict_->WildcardID())) { // rat_subtr contains ratings sum of the two adjacent blobs to be merged. // rat_subtr will be subtracted from the ratings sum of the path, since // the blobs will be joined into a new blob, whose rating is yet unknown. float rat_subtr = curr_b->rating() + parent_vse->curr_b->rating(); // ol_subtr contains the outline length of the blobs that will be joined. float ol_subtr = AssociateUtils::ComputeOutlineLength(rating_cert_scale, *curr_b) + AssociateUtils::ComputeOutlineLength(rating_cert_scale, *(parent_vse->curr_b)); // ol_dif is the outline of the path without the two blobs to be joined. float ol_dif = vse->outline_length - ol_subtr; // priority is set to the average rating of the path per unit of outline, // not counting the ratings of the pieces to be joined. float priority = ol_dif > 0 ? (vse->ratings_sum-rat_subtr)/ol_dif : 0.0; GeneratePainPoint(pain_coord.col, pain_coord.row, LM_PPTYPE_PATH, priority, true, max_char_wh_ratio_, word_res); } else if (debug_level_ > 3) { tprintf("NO pain point (Classified) for col=%d row=%d type=%s\n", pain_coord.col, pain_coord.row, LMPainPointsTypeName[LM_PPTYPE_PATH]); BLOB_CHOICE_IT b_it(word_res->ratings->get(pain_coord.col, pain_coord.row)); for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { BLOB_CHOICE* choice = b_it.data(); choice->print_full(); } } curr_vse = parent_vse; curr_b = curr_vse->curr_b; } }