/// Resegments the word to achieve the target_text from the classifier. /// Returns false if the re-segmentation fails. /// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and /// applies a full search on the classifier results to find the best classified /// segmentation. As a compromise to obtain better recall, 1-1 ambiguity /// substitutions ARE used. bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, WERD_RES* word_res) { // Classify all required combinations of blobs and save results in choices. int word_length = word_res->box_word->length(); GenericVector<BLOB_CHOICE_LIST*>* choices = new GenericVector<BLOB_CHOICE_LIST*>[word_length]; for (int i = 0; i < word_length; ++i) { for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) { BLOB_CHOICE_LIST* match_result = classify_piece( word_res->seam_array, i, i + j - 1, "Applybox", word_res->chopped_word, word_res->blamer_bundle); if (applybox_debug > 2) { tprintf("%d+%d:", i, j); print_ratings_list("Segment:", match_result, unicharset); } choices[i].push_back(match_result); } } // Search the segmentation graph for the target text. Must be an exact // match. Using wildcards makes it difficult to find the correct // segmentation even when it is there. word_res->best_state.clear(); GenericVector<int> search_segmentation; float best_rating = 0.0f; SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating, &word_res->best_state); for (int i = 0; i < word_length; ++i) choices[i].delete_data_pointers(); delete [] choices; if (word_res->best_state.empty()) { // Build the original segmentation and if it is the same length as the // truth, assume it will do. int blob_count = 1; for (int s = 0; s < word_res->seam_array.size(); ++s) { SEAM* seam = word_res->seam_array[s]; if (!seam->HasAnySplits()) { word_res->best_state.push_back(blob_count); blob_count = 1; } else { ++blob_count; } } word_res->best_state.push_back(blob_count); if (word_res->best_state.size() != target_text.size()) { word_res->best_state.clear(); // No good. Original segmentation bad size. return false; } } word_res->correct_text.clear(); for (int i = 0; i < target_text.size(); ++i) { word_res->correct_text.push_back( STRING(unicharset.id_to_unichar(target_text[i]))); } return true; }
/** * @name attempt_blob_chop * * Try to split the this blob after this one. Check to make sure that * it was successful. */ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector<SEAM*>& seams) { if (repair_unchopped_blobs) preserve_outline_tree (blob->outlines); TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ // Insert it into the word. word->blobs.insert(other_blob, blob_number + 1); SEAM *seam = nullptr; if (prioritize_division) { TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { seam = new SEAM(0.0f, location); } } if (seam == nullptr) seam = pick_good_seam(blob); if (chop_debug) { if (seam != nullptr) seam->Print("Good seam picked="); else tprintf("\n** no seam picked *** \n"); } if (seam) { seam->ApplySeam(italic_blob, blob, other_blob); } seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam); if (seam == nullptr) { if (repair_unchopped_blobs) restore_outline_tree(blob->outlines); if (allow_blob_division && !prioritize_division) { // If the blob can simply be divided into outlines, then do that. TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ word->blobs.insert(other_blob, blob_number + 1); seam = new SEAM(0.0f, location); seam->ApplySeam(italic_blob, blob, other_blob); seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam); } } } if (seam != nullptr) { // Make sure this seam doesn't get chopped again. seam->Finalize(); } return seam; }
/********************************************************************** * pick_good_seam * * Find and return a good seam that will split this blob into two pieces. * Work from the outlines provided. **********************************************************************/ SEAM *Wordrec::pick_good_seam(TBLOB *blob) { SeamPile seam_pile(chop_seam_pile_size); EDGEPT *points[MAX_NUM_POINTS]; EDGEPT_CLIST new_points; SEAM *seam = NULL; TESSLINE *outline; inT16 num_points = 0; #ifndef GRAPHICS_DISABLED if (chop_debug > 2) wordrec_display_splits.set_value(true); draw_blob_edges(blob); #endif PointHeap point_heap(MAX_NUM_POINTS); for (outline = blob->outlines; outline; outline = outline->next) prioritize_points(outline, &point_heap); while (!point_heap.empty() && num_points < MAX_NUM_POINTS) { points[num_points++] = point_heap.PeekTop().data; point_heap.Pop(NULL); } /* Initialize queue */ SeamQueue seam_queue(MAX_NUM_SEAMS); try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob); try_vertical_splits(points, num_points, &new_points, &seam_queue, &seam_pile, &seam, blob); if (seam == NULL) { choose_best_seam(&seam_queue, NULL, BAD_PRIORITY, &seam, blob, &seam_pile); } else if (seam->priority() > chop_good_split) { choose_best_seam(&seam_queue, NULL, seam->priority(), &seam, blob, &seam_pile); } EDGEPT_C_IT it(&new_points); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { EDGEPT *inserted_point = it.data(); if (seam == NULL || !seam->UsesPoint(inserted_point)) { for (outline = blob->outlines; outline; outline = outline->next) { if (outline->loop == inserted_point) { outline->loop = outline->loop->next; } } remove_edgept(inserted_point); } } if (seam) { if (seam->priority() > chop_ok_split) { delete seam; seam = NULL; } #ifndef GRAPHICS_DISABLED else if (wordrec_display_splits) { seam->Mark(edge_window); if (chop_debug > 2) { update_edge_window(); edge_window_wait(); } } #endif } if (chop_debug) wordrec_display_splits.set_value(false); return (seam); }
/********************************************************************** * choose_best_seam * * Choose the best seam that can be created by assembling this a * collection of splits. A queue of all the possible seams is * maintained. Each new split received is placed in that queue with * its partial priority value. These values in the seam queue are * evaluated and combined until a good enough seam is found. If no * further good seams are being found then this function returns to the * caller, who will send more splits. If this function is called with * a split of NULL, then no further splits can be supplied by the * caller. **********************************************************************/ void Wordrec::choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) { SEAM *seam; char str[80]; float my_priority; /* Add seam of split */ my_priority = priority; if (split != NULL) { TPOINT split_point = split->point1->pos; split_point += split->point2->pos; split_point /= 2; seam = new SEAM(my_priority, split_point, *split); if (chop_debug > 1) seam->Print("Partial priority "); add_seam_to_queue(my_priority, seam, seam_queue); if (my_priority > chop_good_split) return; } TBOX bbox = blob->bounding_box(); /* Queue loop */ while (!seam_queue->empty()) { SeamPair seam_pair; seam_queue->Pop(&seam_pair); seam = seam_pair.extract_data(); /* Set full priority */ my_priority = seam->FullPriority(bbox.left(), bbox.right(), chop_overlap_knob, chop_centered_maxwidth, chop_center_knob, chop_width_change_knob); if (chop_debug) { sprintf (str, "Full my_priority %0.0f, ", my_priority); seam->Print(str); } if ((*seam_result == NULL || (*seam_result)->priority() > my_priority) && my_priority < chop_ok_split) { /* No crossing */ if (seam->IsHealthy(*blob, chop_min_outline_points, chop_min_outline_area)) { delete *seam_result; *seam_result = new SEAM(*seam); (*seam_result)->set_priority(my_priority); } else { delete seam; seam = NULL; my_priority = BAD_PRIORITY; } } if (my_priority < chop_good_split) { if (seam) delete seam; return; /* Made good answer */ } if (seam) { /* Combine with others */ if (seam_pile->size() < chop_seam_pile_size) { combine_seam(*seam_pile, seam, seam_queue); SeamDecPair pair(seam_pair.key(), seam); seam_pile->Push(&pair); } else if (chop_new_seam_pile && seam_pile->size() == chop_seam_pile_size && seam_pile->PeekTop().key() > seam_pair.key()) { combine_seam(*seam_pile, seam, seam_queue); SeamDecPair pair; seam_pile->Pop(&pair); // pop the worst. // Replace the seam in pair (deleting the old one) with // the new seam and score, then push back into the heap. pair.set_key(seam_pair.key()); pair.set_data(seam); seam_pile->Push(&pair); } else { delete seam; } } my_priority = seam_queue->empty() ? NO_FULL_PRIORITY : seam_queue->PeekTop().key(); if ((my_priority > chop_ok_split) || (my_priority > chop_good_split && split)) return; } }