// Returns a random number in [-range, range].
double Network::Random(double range) {
  ASSERT_HOST(randomizer_ != NULL);
  return randomizer_->SignedRand(range);
}
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = nullptr;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in
    // LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it(new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
    werd_choice->print("ReplaceAmbig() ");
    tprintf("Modified blob_choices: ");
    print_ratings_list("\n", new_choices, getUnicharset());
  }
}
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  PBLOB_IT blob_it;
  inT16 blob_count;
  float noise_score[512];
  int i;
  int min_noise_blob;            // 1st contender
  int max_noise_blob;            // last contender
  int non_noise_count;
  int worst_noise_blob;          // Worst blob
  float small_limit = bln_x_height * fixsp_small_outlines_size;
  float non_noise_limit = bln_x_height * 0.8;

  blob_it.set_to_list(word_res->outword->blob_list());  // normalised
  blob_count = blob_it.length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;                   // too short to split

  /* Get the noise scores for all blobs */
#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->string().string());
#endif
  for (i = 0; i < blob_count; i++, blob_it.forward()) {
    if (word_res->reject_map[i].accepted())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score(blob_it.data());
    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  /* Now find the worst one which is far enough away from the end of the
     word */
  non_noise_count = 0;
  for (i = 0;
       (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  min_noise_blob = i;

  non_noise_count = 0;
  for (i = blob_count - 1;
       (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}
// Top-level method to perform splitting based on current settings.
// Returns true if a split was actually performed.
// split_for_pageseg should be true if the splitting is being done prior to
// page segmentation. This mode uses the flag
// pageseg_devanagari_split_strategy to determine the splitting strategy.
bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
  SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_
                                                   : ocr_split_strategy_;
  if (split_strategy == NO_SPLIT) {
    return false;  // Nothing to do.
  }
  ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
              split_strategy == MAXIMAL_SPLIT);
  ASSERT_HOST(orig_pix_);
  if (devanagari_split_debuglevel > 0) {
    tprintf("Splitting shiro-rekha ...\n");
    tprintf("Split strategy = %s\n",
            split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
    tprintf("Initial pageseg available = %s\n",
            segmentation_block_list_ ? "yes" : "no");
  }
  // Create a copy of original image to store the splitting output.
  pixDestroy(&splitted_image_);
  splitted_image_ = pixCopy(NULL, orig_pix_);

  // Initialize debug image if required.
  if (devanagari_split_debugimage) {
    pixDestroy(&debug_image_);
    debug_image_ = pixConvertTo32(orig_pix_);
  }

  // Determine all connected components in the input image. A close operation
  // may be required prior to this, depending on the current settings.
  Pix* pix_for_ccs = pixClone(orig_pix_);
  if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
      !segmentation_block_list_) {
    if (devanagari_split_debuglevel > 0) {
      tprintf("Performing a global close operation..\n");
    }
    // A global measure is available for xheight, but no local information
    // exists.
    pixDestroy(&pix_for_ccs);
    pix_for_ccs = pixCopy(NULL, orig_pix_);
    PerformClose(pix_for_ccs, global_xheight_);
  }
  Pixa* ccs;
  Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
  boxaDestroy(&tmp_boxa);
  pixDestroy(&pix_for_ccs);

  // Iterate over all connected components. Get their bounding boxes and clip
  // out the image regions corresponding to these boxes from the original
  // image. Conditionally run splitting on each of them.
  Boxa* regions_to_clear = boxaCreate(0);
  for (int i = 0; i < pixaGetCount(ccs); ++i) {
    Box* box = ccs->boxa->box[i];
    Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
    ASSERT_HOST(word_pix);
    int xheight = GetXheightForCC(box);
    if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
        devanagari_split_debugimage) {
      pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
    }
    // If some xheight measure is available, attempt to pre-eliminate small
    // blobs from the shiro-rekha process. This is primarily to save the CCs
    // corresponding to punctuation marks/small dots etc which are part of
    // larger graphemes.
    if (xheight == kUnspecifiedXheight ||
        (box->w > xheight / 3 && box->h > xheight / 2)) {
      SplitWordShiroRekha(split_strategy, word_pix, xheight,
                          box->x, box->y, regions_to_clear);
    } else if (devanagari_split_debuglevel > 0) {
      tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
              box->x, box->y, box->w, box->h);
    }
    pixDestroy(&word_pix);
  }
  // Actually clear the boxes now.
  for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
    Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
    pixClearInRect(splitted_image_, box);
    boxDestroy(&box);
  }
  boxaDestroy(&regions_to_clear);
  pixaDestroy(&ccs);
  if (devanagari_split_debugimage) {
    DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png"
                                     : "ocr_split_debug.png");
  }
  return true;
}
// Attempt to improve this by adding partitions or expanding partitions.
void ColPartitionSet::ImproveColumnCandidate(WidthCallback* cb,
                                             PartSetVector* src_sets) {
  int set_size = src_sets->size();
  // Iterate over the provided column sets, as each one may have something
  // to improve this.
  for (int i = 0; i < set_size; ++i) {
    ColPartitionSet* column_set = src_sets->get(i);
    if (column_set == NULL)
      continue;
    // Iterate over the parts in this and column_set, adding bigger or
    // new parts in column_set to this.
    ColPartition_IT part_it(&parts_);
    ASSERT_HOST(!part_it.empty());
    int prev_right = MIN_INT32;
    part_it.mark_cycle_pt();
    ColPartition_IT col_it(&column_set->parts_);
    for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {
      ColPartition* col_part = col_it.data();
      if (col_part->blob_type() < BRT_UNKNOWN)
        continue;  // Ignore image partitions.
      int col_left = col_part->left_key();
      int col_right = col_part->right_key();
      // Sync-up part_it (in this) so it matches the col_part in column_set.
      ColPartition* part = part_it.data();
      while (!part_it.at_last() && part->right_key() < col_left) {
        prev_right = part->right_key();
        part_it.forward();
        part = part_it.data();
      }
      int part_left = part->left_key();
      int part_right = part->right_key();
      if (part_right < col_left || col_right < part_left) {
        // There is no overlap so this is a new partition.
        AddPartition(col_part->ShallowCopy(), &part_it);
        continue;
      }
      // Check the edges of col_part to see if they can improve part.
      bool part_width_ok = cb->Run(part->KeyWidth(part_left, part_right));
      if (col_left < part_left && col_left > prev_right) {
        // The left edge of the column is better and it doesn't overlap,
        // so we can potentially expand it.
        int col_box_left = col_part->BoxLeftKey();
        bool tab_width_ok = cb->Run(part->KeyWidth(col_left, part_right));
        bool box_width_ok = cb->Run(part->KeyWidth(col_box_left, part_right));
        if (tab_width_ok || !part_width_ok) {
          // The tab is leaving the good column metric at least as good as
          // it was before, so use the tab.
          part->CopyLeftTab(*col_part, false);
          part->SetColumnGoodness(cb);
        } else if (col_box_left < part_left &&
                   (box_width_ok || !part_width_ok)) {
          // The box is leaving the good column metric at least as good as
          // it was before, so use the box.
          part->CopyLeftTab(*col_part, true);
          part->SetColumnGoodness(cb);
        }
        part_left = part->left_key();
      }
      if (col_right > part_right &&
          (part_it.at_last() ||
           part_it.data_relative(1)->left_key() > col_right)) {
        // The right edge is better, so we can possibly expand it.
        int col_box_right = col_part->BoxRightKey();
        bool tab_width_ok = cb->Run(part->KeyWidth(part_left, col_right));
        bool box_width_ok = cb->Run(part->KeyWidth(part_left, col_box_right));
        if (tab_width_ok || !part_width_ok) {
          // The tab is leaving the good column metric at least as good as
          // it was before, so use the tab.
          part->CopyRightTab(*col_part, false);
          part->SetColumnGoodness(cb);
        } else if (col_box_right > part_right &&
                   (box_width_ok || !part_width_ok)) {
          // The box is leaving the good column metric at least as good as
          // it was before, so use the box.
          part->CopyRightTab(*col_part, true);
          part->SetColumnGoodness(cb);
        }
      }
    }
  }
  ComputeCoverage();
}
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
    Pix** music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoLines");
  }
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoImages");
  }
  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;
  int estimated_resolution = source_resolution_;
  if (source_resolution_ == kMinCredibleResolution) {
    // Try to estimate resolution from typical body text size.
    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
    if (res > estimated_resolution && res < kMaxCredibleResolution) {
      estimated_resolution = res;
      tprintf("Estimating resolution as %d\n", estimated_resolution);
    }
  }

  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              estimated_resolution, textord_use_cjk_fp_model,
                              textord_tabfind_aligned_gap_fraction, &v_lines,
                              &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    bool vertical_text = textord_tabfind_force_vertical_text ||
                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
    if (!vertical_text && textord_tabfind_vertical_text &&
        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
      vertical_text =
          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
                                          to_block, &osd_blobs);
    }
    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
      GenericVector<int> osd_scripts;
      if (osd_tess != this) {
        // We are running osd as part of layout analysis, so constrain the
        // scripts to those allowed by *this.
        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
        for (int s = 0; s < sub_langs_.size(); ++s) {
          AddAllScriptsConverted(sub_langs_[s]->unicharset,
                                 osd_tess->unicharset, &osd_scripts);
        }
      }
      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
      if (pageseg_mode == PSM_OSD_ONLY) {
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
      const char* best_script_str =
          osd_tess->unicharset.get_script_from_script_id(best_script_id);
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
                 best_script_id == osd_tess->unicharset.hiragana_sid() ||
                 best_script_id == osd_tess->unicharset.katakana_sid() ||
                 strcmp("Japanese", best_script_str) == 0 ||
                 strcmp("Korean", best_script_str) == 0 ||
                 strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          tprintf(
              "OSD: Weak margin (%.2f) for %d blob text block, "
              "but using orientation anyway: %d\n",
              osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
/**
 * This routine reads a textual description of a prototype from
 * the specified file.
 *
 * Exceptions:
 * - ILLEGALSIGNIFICANCESPEC
 * - ILLEGALSAMPLECOUNT
 * - ILLEGALMEANSPEC
 * - ILLEGALVARIANCESPEC
 * - ILLEGALDISTRIBUTION
 * @param File open text file to read prototype from
 * @param N number of dimensions used in prototype
 * @return List of prototypes
 * @note Globals: None
 * @note History: 6/6/89, DSJ, Created.
 */
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
  char Token[TOKENSIZE];
  int Status;
  PROTOTYPE *Proto;
  int SampleCount;
  int i;

  if ((Status = tfscanf(File, "%s", Token)) == 1) {
    Proto = (PROTOTYPE *) Emalloc(sizeof(PROTOTYPE));
    Proto->Cluster = NULL;
    if (Token[0] == 's')
      Proto->Significant = TRUE;
    else
      Proto->Significant = FALSE;

    Proto->Style = ReadProtoStyle(File);

    if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0))
      DoError(ILLEGALSAMPLECOUNT, "Illegal sample count");
    Proto->NumSamples = SampleCount;

    Proto->Mean = ReadNFloats(File, N, NULL);
    if (Proto->Mean == NULL)
      DoError(ILLEGALMEANSPEC, "Illegal prototype mean");

    switch (Proto->Style) {
      case spherical:
        if (ReadNFloats(File, 1, &(Proto->Variance.Spherical)) == NULL)
          DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Spherical =
            1.0 / sqrt((double) (2.0 * PI * Proto->Variance.Spherical));
        Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float) N);
        Proto->LogMagnitude = log((double) Proto->TotalMagnitude);
        Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
        Proto->Distrib = NULL;
        break;
      case elliptical:
        Proto->Variance.Elliptical = ReadNFloats(File, N, NULL);
        if (Proto->Variance.Elliptical == NULL)
          DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Elliptical =
            (FLOAT32 *) Emalloc(N * sizeof(FLOAT32));
        Proto->Weight.Elliptical = (FLOAT32 *) Emalloc(N * sizeof(FLOAT32));
        Proto->TotalMagnitude = 1.0;
        for (i = 0; i < N; i++) {
          Proto->Magnitude.Elliptical[i] =
              1.0 / sqrt((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
          Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
          Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
        }
        Proto->LogMagnitude = log((double) Proto->TotalMagnitude);
        Proto->Distrib = NULL;
        break;
      case mixed:
        Proto->Distrib = (DISTRIBUTION *) Emalloc(N * sizeof(DISTRIBUTION));
        for (i = 0; i < N; i++) {
          if (tfscanf(File, "%s", Token) != 1)
            DoError(ILLEGALDISTRIBUTION, "Illegal prototype distribution");
          switch (Token[0]) {
            case 'n':
              Proto->Distrib[i] = normal;
              break;
            case 'u':
              Proto->Distrib[i] = uniform;
              break;
            case 'r':
              Proto->Distrib[i] = D_random;
              break;
            default:
              DoError(ILLEGALDISTRIBUTION, "Illegal prototype distribution");
          }
        }
        Proto->Variance.Elliptical = ReadNFloats(File, N, NULL);
        if (Proto->Variance.Elliptical == NULL)
          DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Elliptical =
            (FLOAT32 *) Emalloc(N * sizeof(FLOAT32));
        Proto->Weight.Elliptical = (FLOAT32 *) Emalloc(N * sizeof(FLOAT32));
        Proto->TotalMagnitude = 1.0;
        for (i = 0; i < N; i++) {
          switch (Proto->Distrib[i]) {
            case normal:
              Proto->Magnitude.Elliptical[i] =
                  1.0 /
                  sqrt((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
              Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
              break;
            case uniform:
            case D_random:
              Proto->Magnitude.Elliptical[i] =
                  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
              break;
            case DISTRIBUTION_COUNT:
              ASSERT_HOST(!"Distribution count not allowed!");
          }
          Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
        }
        Proto->LogMagnitude = log((double) Proto->TotalMagnitude);
        break;
    }
    return (Proto);
  } else if (Status == EOF) {
    return (NULL);
  } else {
    DoError(ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
    return (NULL);
  }
}
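// Illustrative sketch, not part of the original source: the shape of the
// textual prototype record that ReadPrototype above consumes, reconstructed
// purely from the order of the reads it performs. The exact token spellings
// accepted by ReadProtoStyle and the float formatting are assumptions.
//
//   significant elliptical 10      <- significance token, style, sample count
//   0.30 0.61 0.12 0.85            <- N mean values (N = 4 in this example)
//   0.02 0.05 0.01 0.04            <- N variances (elliptical style)
//
// A spherical record would carry a single variance instead of N, and a
// "mixed" record would list one distribution token (n/u/r) per dimension
// before its N variances.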
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    bool single_column, bool osd, bool only_osd, BLOCK_LIST* blocks,
    Tesseract* osd_tess, OSResults* osr, TO_BLOCK_LIST* to_blocks,
    Pix** photo_mask_pix, Pix** music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixWrite("tessinput.png", pix_binary_, IFF_PNG);
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
  if (single_column)
    v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;

  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              source_resolution_,
                              &v_lines, &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(*photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
    if (osd && osd_tess != NULL && osr != NULL) {
      os_detect_blobs(&osd_blobs, osr, osd_tess);
      if (only_osd) {
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        int best_script_id = osr->best_result.script_id;
        bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
                   (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
                   (best_script_id == osd_tess->unicharset.katakana_sid());

        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
                  "but using orientation anyway: %d\n",
                  osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == NULL) return 0.0f;  // Already at the end!
  float mean_certainty = 0.0f;
  int certainty_count = 0;
  PAGE_RES_IT res_it(*it_);
  WERD_CHOICE* best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != NULL);
  switch (level) {
    case RIL_BLOCK:
      do {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        mean_certainty += best_choice->certainty();
        ++certainty_count;
        res_it.forward();
      } while (res_it.block() == res_it.prev_block());
      break;
    case RIL_PARA:
      do {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        mean_certainty += best_choice->certainty();
        ++certainty_count;
        res_it.forward();
      } while (res_it.block() == res_it.prev_block() &&
               res_it.row()->row->para() == res_it.prev_row()->row->para());
      break;
    case RIL_TEXTLINE:
      do {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        mean_certainty += best_choice->certainty();
        ++certainty_count;
        res_it.forward();
      } while (res_it.row() == res_it.prev_row());
      break;
    case RIL_WORD:
      mean_certainty += best_choice->certainty();
      ++certainty_count;
      break;
    case RIL_SYMBOL: {
      BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
      if (choices != NULL) {
        BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
        for (int blob = 0; blob < blob_index_; ++blob)
          blob_choices_it.forward();
        BLOB_CHOICE_IT choice_it(blob_choices_it.data());
        for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
             choice_it.forward()) {
          if (choice_it.data()->unichar_id() ==
              best_choice->unichar_id(blob_index_))
            break;
        }
        mean_certainty += choice_it.data()->certainty();
      } else {
        mean_certainty += best_choice->certainty();
      }
      ++certainty_count;
    }
  }
  if (certainty_count > 0) {
    mean_certainty /= certainty_count;
    float confidence = 100 + 5 * mean_certainty;
    if (confidence < 0.0f) confidence = 0.0f;
    if (confidence > 100.0f) confidence = 100.0f;
    return confidence;
  }
  return 0.0f;
}
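// Illustrative sketch, not part of the original source: how the per-word
// confidence computed above is typically consumed through the public
// TessBaseAPI/ResultIterator interface. The header paths and the caller-owned
// Pix* argument are assumptions (older releases install the headers without
// the tesseract/ and leptonica/ prefixes); error handling is minimal.
#include <cstdio>
#include <leptonica/allheaders.h>
#include <tesseract/baseapi.h>

static void PrintWordConfidences(Pix* image) {
  tesseract::TessBaseAPI api;
  if (api.Init(NULL, "eng") != 0) return;
  api.SetImage(image);
  api.Recognize(NULL);
  tesseract::ResultIterator* ri = api.GetIterator();
  if (ri != NULL) {
    do {
      char* word = ri->GetUTF8Text(tesseract::RIL_WORD);
      float conf = ri->Confidence(tesseract::RIL_WORD);  // 0.0f - 100.0f
      if (word != NULL) {
        printf("%s\t%.2f\n", word, conf);
        delete[] word;
      }
    } while (ri->Next(tesseract::RIL_WORD));
    delete ri;
  }
  api.End();
}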
/**
 * Split input into space-separated tokens, strip trailing punctuation
 * from each, determine case properties, call UTF-8 flavor of cost
 * function on each word, and aggregate all into single mean word
 * cost.
 */
int WordUnigrams::Cost(const char_32 *key_str32, LangModel *lang_mod,
                       CharSet *char_set) const {
  if (!key_str32)
    return 0;
  // convert string to UTF8 to split into space-separated words
  string key_str;
  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
  vector<string> words;
  CubeUtils::SplitStringUsing(key_str, " \t", &words);
  // no words => no cost
  if (words.size() <= 0) {
    return 0;
  }
  // aggregate the costs of all the words
  int cost = 0;
  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
    // convert each word back to UTF32 for analyzing case and punctuation
    string_32 str32;
    CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
    int len = CubeUtils::StrLen(str32.c_str());

    // strip all trailing punctuation
    string clean_str;
    int clean_len = len;
    bool trunc = false;
    while (clean_len > 0 &&
           lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
      --clean_len;
      trunc = true;
    }

    // If either the original string was not truncated (no trailing
    // punctuation) or the entire string was removed (all characters
    // are trailing punctuation), evaluate original word as is;
    // otherwise, copy all but the trailing punctuation characters
    char_32 *clean_str32 = NULL;
    if (clean_len == 0 || !trunc) {
      clean_str32 = CubeUtils::StrDup(str32.c_str());
    } else {
      clean_str32 = new char_32[clean_len + 1];
      for (int i = 0; i < clean_len; ++i) {
        clean_str32[i] = str32[i];
      }
      clean_str32[clean_len] = '\0';
    }
    ASSERT_HOST(clean_str32 != NULL);

    string str8;
    CubeUtils::UTF32ToUTF8(clean_str32, &str8);
    int word_cost = CostInternal(str8.c_str());

    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
        word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
        delete [] upper_32;
      }
    }
    if (clean_len >= kMinLengthNumOrCaseInvariant) {
      // if characters are all numeric, incur 0 word cost
      bool is_numeric = true;
      for (int i = 0; i < clean_len; ++i) {
        if (!lang_mod->IsDigit(clean_str32[i]))
          is_numeric = false;
      }
      if (is_numeric)
        word_cost = 0;
    }
    delete [] clean_str32;
    cost += word_cost;
  }  // word_idx

  // return the mean cost
  return static_cast<int>(cost / static_cast<double>(words.size()));
}
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val =
        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }
  if (auto_page_seg_ret_val < 0) {
    return -1;
  }
  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }
  textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
                       blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep
 *                   file
 *   epchoice list - a list of one element per word, containing the text for
 *                   the epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by
 *                   the reject strings in the epchoice text.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  char unrecognised = STRING(unrecognised_char)[0];
  char ep_chars[32];             // Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];              // Only for unlv_tilde_crunch
  char map_chs[32];              // Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = uchset.unichar_to_id(" ");

  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)))) {
      if (!word->word->flag(W_BOL) &&
          (word->word->space() > 0) &&
          !word->word->flag(W_FUZZY_NON) &&
          !word->word->flag(W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      } else {
        map_chs[txt_index++] = '0';
        /*
          The ep_choice string is a faked reject to allow newdiff to sync the
          .etx with the .txt and .map files.
        */
        ep_chars[ep_chars_index++] = CTRL_INSET;  // escape code
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 2;           // type
        ep_chars[ep_chars_index++] = 1;           // dummy reject
        ep_chars[ep_chars_index++] = 1;           // dummy reject
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) ||
        force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
      // end line - cos of the real newline
      ep_chars[ep_chars_index++] = newline_type;

      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    word->ep_choice = new WERD_CHOICE(ep_chars, uchset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes
       within words have been removed */
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->reject_map.remove_pos(0);
    word->box_word->DeleteBox(0);
  }
  if (newline_type ||
      (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length() > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) ==
          space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    } else if (word->word->space() > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt(word, 120);
  if (tessedit_rejection_debug) {
    tprintf("Dict word: \"%s\": %d\n",
            word->best_choice->debug_string().string(),
            dict_word(*(word->best_choice)));
  }
  if (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) {
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += uchset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
// Segment the page according to the current value of tessedit_pageseg_mode.
// If the pix_binary_ member is not NULL, it is used as the source image,
// and copied to image, otherwise it just uses image as the input.
// On return the blocks list owns all the constructed page layout.
int Tesseract::SegmentPage(const STRING* input_file,
                           IMAGE* image, BLOCK_LIST* blocks) {
  int width = image->get_xsize();
  int height = image->get_ysize();
  int resolution = image->get_res();
#ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL) {
    width = pixGetWidth(pix_binary_);
    height = pixGetHeight(pix_binary_);
    resolution = pixGetXRes(pix_binary_);
  }
#endif
  // Zero resolution messes up the algorithms, so make sure it is credible.
  if (resolution < kMinCredibleResolution)
    resolution = kDefaultResolution;
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (pageseg_mode != tesseract::PSM_AUTO &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  bool single_column = pageseg_mode > PSM_AUTO;
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_COLUMN.
    pageseg_mode = PSM_SINGLE_COLUMN;
  }

  TO_BLOCK_LIST land_blocks, port_blocks;
  TBOX page_box;
  if (pageseg_mode <= PSM_SINGLE_COLUMN) {
    if (AutoPageSeg(width, height, resolution, single_column,
                    image, blocks, &port_blocks) < 0) {
      return -1;
    }
    // To create blobs from the image region bounds uncomment this line:
    //  port_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
#if HAVE_LIBLEPT
    image->FromPix(pix_binary_);
#endif
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
  }
  if (blocks->empty()) {
    tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

  if (port_blocks.empty()) {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(blocks, &land_blocks, &port_blocks, &page_box);
  } else {
    // AutoPageSeg does not need to find_components as it did that already.
    page_box.set_left(0);
    page_box.set_bottom(0);
    page_box.set_right(width);
    page_box.set_top(height);
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_box.topright(), &port_blocks, true);
  }

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!port_blocks.empty());
  TO_BLOCK* to_block = to_block_it.data();
  if (pageseg_mode <= PSM_SINGLE_BLOCK ||
      to_block->line_size < 2) {
    // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK all map to the old
    // textord. The difference is the number of blocks and how they are made.
    textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
                 this);
  } else {
    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    float gradient = make_single_row(page_box.topright(),
                                     to_block, &port_blocks, this);
    if (pageseg_mode == PSM_SINGLE_LINE) {
      // SINGLE_LINE uses the old word maker on the single line.
      make_words(page_box.topright(), gradient, blocks,
                 &land_blocks, &port_blocks, this);
    } else {
      // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
      // single word, and in SINGLE_CHAR mode, all the outlines
      // go in a single blob.
      make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                       to_block->get_rows(), to_block->block->row_list());
    }
  }
  return 0;
}
// Auto page segmentation. Divide the page image into blocks of uniform
// text linespacing and images.
// Width, height and resolution are derived from the input image.
// If the pix is non-NULL, then it is assumed to be the input, and it is
// copied to the image, otherwise the image is used directly.
// The output goes in the blocks list with corresponding TO_BLOCKs in the
// to_blocks list.
// If single_column is true, then no attempt is made to divide the image
// into columns, but multiple blocks are still made if the text is of
// non-uniform linespacing.
int Tesseract::AutoPageSeg(int width, int height, int resolution,
                           bool single_column, IMAGE* image,
                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);
  Boxa* boxa = NULL;
  Pixa* pixa = NULL;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST found_blocks;

#ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL) {
    if (textord_debug_images) {
      Pix* grey_pix = pixCreate(width, height, 8);
      // Printable images are light grey on white, but for screen display
      // they are black on dark grey so the other colors show up well.
      if (textord_debug_printable) {
        pixSetAll(grey_pix);
        pixSetMasked(grey_pix, pix_binary_, 192);
      } else {
        pixSetAllArbitrary(grey_pix, 64);
        pixSetMasked(grey_pix, pix_binary_, 0);
      }
      AlignedBlob::IncrementDebugPix();
      pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
      pixDestroy(&grey_pix);
    }
    if (tessedit_dump_pageseg_images)
      pixWrite("tessinput.png", pix_binary_, IFF_PNG);
    // Leptonica is used to find the lines and image regions in the input.
    LineFinder::FindVerticalLines(resolution, pix_binary_,
                                  &vertical_x, &vertical_y, &v_lines);
    LineFinder::FindHorizontalLines(resolution, pix_binary_, &h_lines);
    if (tessedit_dump_pageseg_images)
      pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
    ImageFinder::FindImages(pix_binary_, &boxa, &pixa);
    if (tessedit_dump_pageseg_images)
      pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
    // Copy the Pix to the IMAGE.
    image->FromPix(pix_binary_);
    if (single_column)
      v_lines.clear();
  }
#endif
  TO_BLOCK_LIST land_blocks, port_blocks;
  TBOX page_box;
  // The rest of the algorithm uses the usual connected components.
  find_components(blocks, &land_blocks, &port_blocks, &page_box);

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!to_block_it.empty());
  for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list();
       to_block_it.forward()) {
    TO_BLOCK* to_block = to_block_it.data();
    TBOX blkbox = to_block->block->bounding_box();
    if (to_block->line_size >= 2) {
      // Note: if there are multiple blocks, then v_lines, boxa, and pixa
      // are empty on the next iteration, but in this case, we assume
      // that there aren't any interesting line separators or images, since
      // it means that we have a pre-defined unlv zone file.
      ColumnFinder finder(static_cast<int>(to_block->line_size),
                          blkbox.botleft(), blkbox.topright(),
                          &v_lines, &h_lines, vertical_x, vertical_y);
      if (finder.FindBlocks(height, resolution, single_column,
                            to_block, boxa, pixa, &found_blocks,
                            to_blocks) < 0)
        return -1;
      finder.ComputeDeskewVectors(&deskew_, &reskew_);
      boxa = NULL;
      pixa = NULL;
    }
  }
#ifdef HAVE_LIBLEPT
  boxaDestroy(&boxa);
  pixaDestroy(&pixa);
#endif
  blocks->clear();
  BLOCK_IT block_it(blocks);
  // Move the found blocks to the input/output blocks.
  block_it.add_list_after(&found_blocks);

  if (textord_debug_images) {
    // The debug image is no longer needed so delete it.
    unlink(AlignedBlob::textord_debug_pix().string());
  }
  return 0;
}
// Make the textlines and words inside each block.
void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew,
                          int width, int height, Pix *binary_pix,
                          Pix *thresholds_pix, Pix *grey_pix,
                          bool use_box_bottoms,
                          BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
  page_tr_.set_x(width);
  page_tr_.set_y(height);
  if (to_blocks->empty()) {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(binary_pix, blocks, to_blocks);
    TO_BLOCK_IT it(to_blocks);
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      TO_BLOCK *to_block = it.data();
      // Compute the edge offsets whether or not there is a grey_pix.
      // We have by-passed auto page seg, so we have to run it here.
      // By page segmentation mode there is no non-text to avoid running on.
      to_block->ComputeEdgeOffsets(thresholds_pix, grey_pix);
    }
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // AutoPageSeg does not need to find_components as it did that already.
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_tr_, to_blocks, true);
  }

  ASSERT_HOST(!to_blocks->empty());
  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
    const FCOORD anticlockwise90(0.0f, 1.0f);
    const FCOORD clockwise90(0.0f, -1.0f);
    TO_BLOCK_IT it(to_blocks);
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      TO_BLOCK *to_block = it.data();
      BLOCK *block = to_block->block;
      // Create a fake poly_block in block from its bounding box.
      block->set_poly_block(new POLY_BLOCK(block->bounding_box(),
                                           PT_VERTICAL_TEXT));
      // Rotate the to_block along with its contained block and blobnbox
      // lists.
      to_block->rotate(anticlockwise90);
      // Set the block's rotation values to obey the convention followed in
      // layout analysis for vertical text.
      block->set_re_rotation(clockwise90);
      block->set_classify_rotation(clockwise90);
    }
  }
  TO_BLOCK_IT to_block_it(to_blocks);
  TO_BLOCK *to_block = to_block_it.data();
  // Make the rows in the block.
  float gradient = 0;
  // Do it the old fashioned way.
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    gradient = make_rows(page_tr_, to_blocks);
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single
    // row.
    gradient = make_single_row(page_tr_, pageseg_mode != PSM_RAW_LINE,
                               to_block, to_blocks);
  }
  BaselineDetect baseline_detector(textord_baseline_debug,
                                   reskew, to_blocks);
  baseline_detector.ComputeStraightBaselines(use_box_bottoms);
  baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true,
                                                      textord_heavy_nr,
                                                      textord_show_final_rows,
                                                      this);
  // Now make the words in the lines.
  if (PSM_WORD_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_LINE uses the old word maker on the single line.
    make_words(this, page_tr_, gradient, blocks, to_blocks);
  } else {
    // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
    // single word, and in SINGLE_CHAR mode, all the outlines
    // go in a single blob.
    TO_BLOCK *to_block = to_block_it.data();
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     to_block->get_rows(), to_block->block->row_list());
  }
  // Remove empties.
  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
  // Compute the margins for each row in the block, to be used later for
  // paragraph detection.
  BLOCK_IT b_it(blocks);
  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
    b_it.data()->compute_row_margins();
  }
#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif
}
C_OUTLINE::C_OUTLINE(          // constructor
    C_OUTLINE *srcline,        // outline to rotate
    FCOORD rotation) {
  TBOX new_box;                // easy bounding
  inT16 stepindex;             // index to step
  inT16 dirdiff;               // direction change
  ICOORD pos;                  // current position
  ICOORD prevpos;              // previous dest point
  ICOORD destpos;              // destination point
  inT16 destindex;             // index to step
  DIR128 dir;                  // coded direction
  uinT8 new_step;

  stepcount = srcline->stepcount * 2;
  // get memory
  steps = (uinT8 *) alloc_mem(step_mem());
  memset(steps, 0, step_mem());

  for (int iteration = 0; iteration < 2; ++iteration) {
    DIR128 round1 = iteration == 0 ? 32 : 0;
    DIR128 round2 = iteration != 0 ? 32 : 0;
    pos = srcline->start;
    prevpos = pos;
    prevpos.rotate(rotation);
    start = prevpos;
    box = TBOX(start, start);
    destindex = 0;
    for (stepindex = 0; stepindex < srcline->stepcount; stepindex++) {
      pos += srcline->step(stepindex);
      destpos = pos;
      destpos.rotate(rotation);
      //  printf("%i %i %i %i ", destpos.x(), destpos.y(), pos.x(), pos.y());
      while (destpos.x() != prevpos.x() || destpos.y() != prevpos.y()) {
        dir = DIR128(FCOORD(destpos - prevpos));
        dir += 64;               // turn to step style
        new_step = dir.get_dir();
        //  printf(" %i\n", new_step);
        if (new_step & 31) {
          set_step(destindex++, dir + round1);
          prevpos += step(destindex - 1);
          if (destindex < 2 ||
              ((dirdiff = step_dir(destindex - 1) - step_dir(destindex - 2)) !=
               -64 && dirdiff != 64)) {
            set_step(destindex++, dir + round2);
            prevpos += step(destindex - 1);
          } else {
            prevpos -= step(destindex - 1);
            destindex--;
            prevpos -= step(destindex - 1);
            set_step(destindex - 1, dir + round2);
            prevpos += step(destindex - 1);
          }
        } else {
          set_step(destindex++, dir);
          prevpos += step(destindex - 1);
        }
        while (destindex >= 2 &&
               ((dirdiff = step_dir(destindex - 1) - step_dir(destindex - 2))
                == -64 || dirdiff == 64)) {
          prevpos -= step(destindex - 1);
          prevpos -= step(destindex - 2);
          destindex -= 2;          // Forget u turn
        }
        //ASSERT_HOST(prevpos.x() == destpos.x() && prevpos.y() == destpos.y());
        new_box = TBOX(destpos, destpos);
        box += new_box;
      }
    }
    ASSERT_HOST(destpos.x() == start.x() && destpos.y() == start.y());
    dirdiff = step_dir(destindex - 1) - step_dir(0);
    while ((dirdiff == 64 || dirdiff == -64) && destindex > 1) {
      start += step(0);
      destindex -= 2;
      for (int i = 0; i < destindex; ++i)
        set_step(i, step_dir(i + 1));
      dirdiff = step_dir(destindex - 1) - step_dir(0);
    }
    if (destindex >= 4)
      break;
  }
  stepcount = destindex;
  destpos = start;
  for (stepindex = 0; stepindex < stepcount; stepindex++) {
    destpos += step(stepindex);
  }
  ASSERT_HOST(destpos.x() == start.x() && destpos.y() == start.y());
}
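// Illustrative sketch, not part of the original source: the coordinate
// rotation that prevpos.rotate(rotation) and destpos.rotate(rotation) apply
// above. An FCOORD used as a rotation is assumed to hold (cos theta,
// sin theta), so rotating a point is the usual 2-D rotation; the standalone
// helper below only demonstrates that arithmetic with hypothetical types.
struct PointF { float x, y; };

// Rotate (x, y) by the unit vector (cos_t, sin_t):
//   x' = x * cos_t - y * sin_t
//   y' = x * sin_t + y * cos_t
static PointF RotateByUnitVector(PointF p, float cos_t, float sin_t) {
  PointF out;
  out.x = p.x * cos_t - p.y * sin_t;
  out.y = p.x * sin_t + p.y * cos_t;
  return out;
}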
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  // The diacritic_blobs holds noise blobs that may be diacritics. They
  // are separated out on areas of the image that seem noisy and short-circuit
  // the layout process, going straight from the initial partition creation
  // right through to after word segmentation, where they are added to the
  // rej_cblobs list of the most appropriate word. From there classification
  // will determine whether they are used.
  BLOBNBOX_LIST diacritic_blobs;
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val = AutoPageSeg(
        pageseg_mode, blocks, &to_blocks,
        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }
  if (auto_page_seg_ret_val < 0) {
    return -1;
  }
  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }
  bool splitting =
      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;
  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
                       &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
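// Illustrative sketch, not part of the original source: how client code
// selects the page segmentation mode that SegmentPage consults via
// tessedit_pageseg_mode. SetPageSegMode is the public API for this; the
// header path is an assumption and varies between releases.
#include <tesseract/baseapi.h>

static char* RecognizeWithAutoOsd(Pix* image) {
  tesseract::TessBaseAPI api;
  if (api.Init(NULL, "eng") != 0) return NULL;
  // PSM_AUTO_OSD enables full layout analysis plus orientation and script
  // detection; PSM_SINGLE_BLOCK skips column finding, and PSM_OSD_ONLY runs
  // orientation/script detection without recognition.
  api.SetPageSegMode(tesseract::PSM_AUTO_OSD);
  api.SetImage(image);
  char* text = api.GetUTF8Text();  // caller must delete [] the result
  api.End();
  return text;
}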
/**
 * word_display()  Word Processor
 *
 * Display a word according to its display modes.
 */
BOOL8 Tesseract::word_display(PAGE_RES_IT* pr_it) {
  WERD_RES* word_res = pr_it->word();
  WERD* word = word_res->word;
  TBOX word_bb;                  // word bounding box
  int word_height;               // ht of word BB
  BOOL8 displayed_something = FALSE;
  float shift;                   // from bot left
  C_BLOB_IT c_it;                // cblob iterator

  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* box_word = word_res->box_word;
    WERD_CHOICE* best_choice = word_res->best_choice;
    int length = box_word->length();
    if (word_res->fontinfo == NULL) return false;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_SUPERSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_ITALIC:
          if (font_info.is_italic())
            color = ScrollView::RED;
          break;
        case CM_BOLD:
          if (font_info.is_bold())
            color = ScrollView::RED;
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch())
            color = ScrollView::RED;
          break;
        case CM_SERIF:
          if (font_info.is_serif())
            color = ScrollView::RED;
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps)
            color = ScrollView::RED;
          break;
        case CM_DROPCAPS:
          if (best_choice->BlobPosition(i) == SP_DROPCAP)
            color = ScrollView::RED;
          break;
          // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return true;
  }
  /*
    Note the double coercions of (COLOUR)((inT32)editor_image_word_bb_color)
    etc. are to keep the compiler happy.
  */
  // display bounding box
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
                              (ScrollView::Color)
                                  ((inT32) editor_image_word_bb_color),
                              (ScrollView::Color)
                                  ((inT32) editor_image_word_bb_color));

    ScrollView::Color c = (ScrollView::Color)
        ((inT32) editor_image_blob_bb_color);
    image_win->Pen(c);
    c_it.set_to_list(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
      c_it.data()->bounding_box().plot(image_win);
    displayed_something = TRUE;
  }

  // display edge steps
  if (word->display_flag(DF_EDGE_STEP)) {     // edgesteps available
    word->plot(image_win);                    // rainbow colors
    displayed_something = TRUE;
  }

  // display poly approx
  if (word->display_flag(DF_POLYGONAL)) {
    // need to convert
    TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = TRUE;
  }

  // Display correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
    text = word->text();
  }
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != NULL &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == NULL) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == NULL) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
    }
    text += best_choice_str;
    IncorrectResultReason reason = (blamer_bundle == NULL) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  if (text.length() > 0) {
    word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    word_height = word_bb.height();
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.string());
    if (blame.length() > 0) {
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.string());
    }
    displayed_something = TRUE;
  }

  if (!displayed_something)      // display BBox anyway
    word->bounding_box().plot(image_win,
                              (ScrollView::Color)
                                  ((inT32) editor_image_word_bb_color),
                              (ScrollView::Color)
                                  ((inT32) editor_image_word_bb_color));
  return TRUE;
}
/** * rebuild_current_state * * Evaluate the segmentation that is represented by this state in the * best first search. Add this state to the "states_seen" list. */ BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state( TBLOB *blobs, SEAMS seam_list, STATE *state, BLOB_CHOICE_LIST_VECTOR *old_choices, int fx, bool force_rebuild, const WERD_CHOICE &best_choice, const MATRIX *ratings) { // Initialize search_state, num_joints, x, y. int num_joints = array_count(seam_list); #ifndef GRAPHICS_DISABLED if (wordrec_display_segmentations) { print_state("Rebuilding state", state, num_joints); } #endif SEARCH_STATE search_state = bin_to_chunks(state, num_joints); int x = 0; int y; int i; for (i = 1; i <= search_state[0]; i++) { y = x + search_state[i]; x = y + 1; } y = count_blobs (blobs) - 1; // Initialize char_choices, expanded_fragment_lengths: // e.g. if fragment_lengths = {1 1 2 3 1}, // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}. BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); STRING expanded_fragment_lengths_str = ""; bool state_has_fragments = false; const char *fragment_lengths = NULL; if (best_choice.length() > 0) { fragment_lengths = best_choice.fragment_lengths(); } if (fragment_lengths) { for (int i = 0; i < best_choice.length(); ++i) { *char_choices += NULL; if (fragment_lengths[i] > 1) { state_has_fragments = true; } for (int j = 0; j < fragment_lengths[i]; ++j) { expanded_fragment_lengths_str += fragment_lengths[i]; } } } else { for (i = 0; i <= search_state[0]; ++i) { expanded_fragment_lengths_str += (char)1; *char_choices += NULL; } } // Finish early if force_rebuild is false and there are no fragments to merge. if (!force_rebuild && !state_has_fragments) { delete char_choices; memfree(search_state); return old_choices; } // Set up variables for concatenating fragments. const char *word_lengths_ptr = NULL; const char *word_ptr = NULL; if (state_has_fragments) { // Make word_lengths_ptr point to the last element in // best_choice->unichar_lengths(). word_lengths_ptr = best_choice.unichar_lengths().string(); word_lengths_ptr += (strlen(word_lengths_ptr)-1); // Make word_ptr point to the beginning of the last // unichar in best_choice->unichar_string(). word_ptr = best_choice.unichar_string().string(); word_ptr += (strlen(word_ptr)-*word_lengths_ptr); } const char *expanded_fragment_lengths = expanded_fragment_lengths_str.string(); bool merging_fragment = false; int true_y = -1; char unichar[UNICHAR_LEN + 1]; int fragment_pieces = -1; float rating = 0.0; float certainty = -MAX_FLOAT32; // Populate char_choices list such that it corresponds to search_state. // // If we are rebuilding a state that contains character fragments: // -- combine blobs that belong to character fragments // -- re-classify the blobs to obtain choices list for the merged blob // -- ensure that correct classification appears in the new choices list // NOTE: a choice composed from the original fragment choices will always be // added to the new choices list for each character composed from // fragments (even if the choice for the corresponding character appears // in the re-classified choices list for the newly merged blob). BLOB_CHOICE_IT temp_it; int char_choices_index = char_choices->length() - 1; for (i = search_state[0]; i >= 0; i--) { BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify( blobs, seam_list, x, y, fx, ratings, old_choices); // Combine character fragments. if (expanded_fragment_lengths[i] > 1) { // Start merging character fragments.
if (!merging_fragment) { merging_fragment = true; true_y = y; fragment_pieces = expanded_fragment_lengths[i]; rating = 0.0; certainty = -MAX_FLOAT32; strncpy(unichar, word_ptr, *word_lengths_ptr); unichar[*word_lengths_ptr] = '\0'; } // Take into account the fact that we could have joined pieces // since we first recorded the ending point of a fragment (true_y). true_y -= y - x; // Populate fragment with updated values and look for the // fragment with the same values in current_choices. // Update rating and certainty of the character being composed. fragment_pieces--; CHAR_FRAGMENT fragment; fragment.set_all(unichar, fragment_pieces, expanded_fragment_lengths[i]); temp_it.set_to_list(current_choices); for (temp_it.mark_cycle_pt(); !temp_it.cycled_list(); temp_it.forward()) { const CHAR_FRAGMENT *current_fragment = getDict().getUnicharset().get_fragment(temp_it.data()->unichar_id()); if (current_fragment && fragment.equals(current_fragment)) { rating += temp_it.data()->rating(); if (temp_it.data()->certainty() > certainty) { certainty = temp_it.data()->certainty(); } break; } } assert(!temp_it.cycled_list()); // make sure we found the fragment // Free current_choices for the fragmented character. delete current_choices; // Finish composing character from fragments. if (fragment_pieces == 0) { // Populate current_choices with the classification of // the blob merged from blobs of each character fragment. current_choices = join_blobs_and_classify(blobs, seam_list, x, true_y, fx, ratings, NULL); BLOB_CHOICE *merged_choice = new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar), rating, certainty, 0, NO_PERM); // Insert merged_blob into current_choices, such that current_choices // are still sorted in non-descending order by rating. ASSERT_HOST(!current_choices->empty()); temp_it.set_to_list(current_choices); for (temp_it.mark_cycle_pt(); !temp_it.cycled_list() && merged_choice->rating() > temp_it.data()->rating(); temp_it.forward()); temp_it.add_before_stay_put(merged_choice); // Done merging this fragmented character. merging_fragment = false; } } if (!merging_fragment) { // Get rid of fragments in current_choices. temp_it.set_to_list(current_choices); for (temp_it.mark_cycle_pt(); !temp_it.cycled_list(); temp_it.forward()) { if (getDict().getUnicharset().get_fragment( temp_it.data()->unichar_id())) { delete temp_it.extract(); } } char_choices->set(current_choices, char_choices_index); char_choices_index--; // Update word_ptr and word_lengths_ptr. if (word_lengths_ptr != NULL && word_ptr != NULL) { word_lengths_ptr--; word_ptr -= (*word_lengths_ptr); } } y = x - 1; x = y - search_state[i]; } old_choices->delete_data_pointers(); delete old_choices; memfree(search_state); return (char_choices); }
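The fragment-length expansion that the comments in rebuild_current_state() describe ({1 1 2 3 1} becoming {1 1 2 2 3 3 3 1}) can be shown in isolation. The snippet below is a standalone illustration using std::string in place of STRING; it is not Tesseract code.

#include <cstdio>
#include <string>

// Repeats each per-character fragment count once per blob it covers, mirroring
// the expansion of fragment_lengths into expanded_fragment_lengths_str above.
std::string ExpandFragmentLengths(const std::string& fragment_lengths) {
  std::string expanded;
  for (char len : fragment_lengths) {
    for (int j = 0; j < len; ++j) expanded += len;
  }
  return expanded;
}

int main() {
  const std::string lengths = {1, 1, 2, 3, 1};       // raw counts, not ASCII digits
  for (char len : ExpandFragmentLengths(lengths)) printf("%d ", len);
  printf("\n");                                      // prints: 1 1 2 2 3 3 3 1
  return 0;
}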
// Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { unichar_str = UNICHARSET::kCustomLigatures[i][0]; break; } } // Convert the unichar to UTF32 representation std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str); // Assume that if the property is true for any character in the string, // then it holds for the whole "character". bool unichar_isalpha = false; bool unichar_islower = false; bool unichar_isupper = false; bool unichar_isdigit = false; bool unichar_ispunct = false; for (char32 u_ch : uni_vector) { if (u_isalpha(u_ch)) unichar_isalpha = true; if (u_islower(u_ch)) unichar_islower = true; if (u_isupper(u_ch)) unichar_isupper = true; if (u_isdigit(u_ch)) unichar_isdigit = true; if (u_ispunct(u_ch)) unichar_ispunct = true; } unicharset->set_isalpha(unichar_id, unichar_isalpha); unicharset->set_islower(unichar_id, unichar_islower); unicharset->set_isupper(unichar_id, unichar_isupper); unicharset->set_isdigit(unichar_id, unichar_isdigit); unicharset->set_ispunctuation(unichar_id, unichar_ispunct); tesseract::IcuErrorCode err; unicharset->set_script(unichar_id, uscript_getName( uscript_getScript(uni_vector[0], err))); const int num_code_points = uni_vector.size(); // Obtain the lower/upper case if needed and record it in the properties. unicharset->set_other_case(unichar_id, unichar_id); if (unichar_islower || unichar_isupper) { std::vector<char32> other_case(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. // However since they deal with UChars (so need a conversion function // from char32 or UTF8string) and require a meaningful locale string, // for now u_tolower()/u_toupper() are used. other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]); } std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case); UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str()); if (other_case_id != INVALID_UNICHAR_ID) { unicharset->set_other_case(unichar_id, other_case_id); } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str); } } // Set RTL property and obtain mirror unichar ID from ICU. std::vector<char32> mirrors(num_code_points, 0); for (int i = 0; i < num_code_points; ++i) { mirrors[i] = u_charMirror(uni_vector[i]); if (i == 0) { // set directionality to that of the 1st code point unicharset->set_direction(unichar_id, static_cast<UNICHARSET::Direction>( u_charDirection(uni_vector[i]))); } } std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors); UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); if (mirror_uch_id != INVALID_UNICHAR_ID) { unicharset->set_mirror(unichar_id, mirror_uch_id); } else if (report_errors) { tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str); } // Record normalized version of this unichar. std::string normed_str; if (unichar_id != 0 && tesseract::NormalizeUTF8String( decompose ? 
tesseract::UnicodeNormMode::kNFKD : tesseract::UnicodeNormMode::kNFKC, tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str, &normed_str) && !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); } ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); } unicharset->post_load_setup(); }
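The case mapping above works one code point at a time because u_tolower()/u_toupper() are ICU code-point functions, which is why the loop walks the UTF-32 form of the unichar rather than its UTF-8 bytes. A minimal standalone sketch of that call pattern, assuming an ICU development install:

#include <cstdio>
#include <unicode/uchar.h>

int main() {
  UChar32 lower = 0x00E9;              // U+00E9 LATIN SMALL LETTER E WITH ACUTE
  UChar32 upper = u_toupper(lower);    // maps to U+00C9 for this code point
  printf("U+%04X -> U+%04X\n", static_cast<unsigned>(lower), static_cast<unsigned>(upper));
  return 0;
}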
// Returns true if the given string is equivalent to the truth string for // the current word. bool LTRResultIterator::EquivalentToTruth(const char *str) const { if (!HasTruthString()) return false; ASSERT_HOST(it_->word()->uch_set != NULL); WERD_CHOICE str_wd(str, *(it_->word()->uch_set)); return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd); }
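EquivalentToTruth() only returns something meaningful when the word carries a blamer bundle with a truth string, which typically requires running with ground-truth data loaded (training/evaluation setups). A hypothetical helper built on the public iterator API might look like the sketch below; the function name, header path, and the assumption that the iterator already has truth data attached are mine, not the library's.

#include <tesseract/ltrresultiterator.h>   // older source trees use "ltrresultiterator.h"

// Counts recognized words that match their attached ground truth.
static int CountCorrectWords(tesseract::LTRResultIterator* it) {
  int correct = 0;
  if (it == nullptr) return 0;
  do {
    if (!it->HasTruthString()) continue;   // no truth recorded for this word
    char* word = it->GetUTF8Text(tesseract::RIL_WORD);
    if (word != nullptr && it->EquivalentToTruth(word)) ++correct;
    delete[] word;
  } while (it->Next(tesseract::RIL_WORD));
  return correct;
}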
WERD_CHOICE *split_and_recog_word( //recog one word WERD *word, //word to do DENORM *denorm, //de-normaliser POLY_MATCHER matcher, //matcher function POLY_TESTER tester, //tester function POLY_TESTER trainer, //trainer function BOOL8 testing, //true if answer driven WERD_CHOICE *&raw_choice, //raw result //list of blob lists BLOB_CHOICE_LIST_CLIST *blob_choices, WERD *&outword //bln word output ) { // inT32 outword1_len; // inT32 outword2_len; WERD *first_word; //poly copy of word WERD *second_word; //fabricated word WERD *outword2; //2nd output word PBLOB *blob; WERD_CHOICE *result; //return value WERD_CHOICE *result2; //output of 2nd word WERD_CHOICE *raw_choice2; //raw version of 2nd float gap; //blob gap float bestgap; //biggest gap PBLOB_LIST new_blobs; //list of gathered blobs PBLOB_IT blob_it; //iterator PBLOB_IT new_blob_it = &new_blobs; first_word = word->poly_copy (denorm->row ()->x_height ()); blob_it.set_to_list (first_word->blob_list ()); bestgap = -MAX_INT32; while (!blob_it.at_last ()) { blob = blob_it.data (); //gap to next gap = blob_it.data_relative (1)->bounding_box ().left () - blob->bounding_box ().right (); blob_it.forward (); if (gap > bestgap) { bestgap = gap; //find biggest new_blob_it = blob_it; //save position } } //take 2nd half new_blobs.assign_to_sublist (&new_blob_it, &blob_it); //make it a word second_word = new WERD (&new_blobs, 1, NULL); ASSERT_HOST (word->blob_list ()->length () == first_word->blob_list ()->length () + second_word->blob_list ()->length ()); result = recog_word_recursive (first_word, denorm, matcher, tester, trainer, testing, raw_choice, blob_choices, outword); delete first_word; //done that one result2 = recog_word_recursive (second_word, denorm, matcher, tester, trainer, testing, raw_choice2, blob_choices, outword2); delete second_word; //done that too *result += *result2; //combine ratings delete result2; *raw_choice += *raw_choice2; delete raw_choice2; //finished with it // outword1_len= outword->blob_list()->length(); // outword2_len= outword2->blob_list()->length(); outword->join_on (outword2); //join words delete outword2; // if ( outword->blob_list()->length() != outword1_len + outword2_len ) // tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n", // outword1_len, outword2_len, outword->blob_list()->length() ); // ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len ); return result; }
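The split point chosen above is simply the widest horizontal gap between adjacent blob bounding boxes. Here is a self-contained illustration of that heuristic using plain structs rather than the PBLOB classes; the names are made up for the example.

#include <cstdio>

// Picks the boundary with the widest gap between adjacent boxes.
struct Box { int left, right; };

static int WidestGapIndex(const Box* boxes, int n) {
  int best_index = -1;
  int best_gap = -1;
  for (int i = 0; i + 1 < n; ++i) {
    int gap = boxes[i + 1].left - boxes[i].right;
    if (gap > best_gap) {
      best_gap = gap;
      best_index = i;  // split between blob i and blob i+1
    }
  }
  return best_index;
}

int main() {
  const Box boxes[] = {{0, 10}, {12, 20}, {30, 40}, {41, 50}};
  printf("split after blob %d\n", WidestGapIndex(boxes, 4));  // prints: split after blob 1
  return 0;
}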
// Return the ColumnSpanningType that best explains the columns overlapped // by the given coords(left,right,y), with the given margins. // Also return the first and last column index touched by the coords and // the leftmost spanned column. // Column indices are 2n + 1 for real columns (0 based) and even values // represent the gaps in between columns, with 0 being left of the leftmost. // resolution refers to the ppi resolution of the image. ColumnSpanningType ColPartitionSet::SpanningType(int resolution, int left, int right, int y, int left_margin, int right_margin, int* first_col, int* last_col, int* first_spanned_col) { *first_col = -1; *last_col = -1; *first_spanned_col = -1; int margin_columns = 0; ColPartition_IT it(&parts_); int col_index = 1; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), col_index += 2) { ColPartition* part = it.data(); if (part->ColumnContains(left, y)) { // In the default case, first_col is set, but columns_spanned remains // zero, so first_col will get reset in the first column genuinely // spanned, but we can tell the difference from a noise partition // that touches no column. *first_col = col_index; if (part->ColumnContains(right, y)) { // Both within a single column. *last_col = col_index; return CST_FLOWING; } if (left_margin <= part->LeftAtY(y)) { // It completely spans this column. *first_spanned_col = col_index; margin_columns = 1; } } else if (part->ColumnContains(right, y)) { if (*first_col < 0) { // It started in-between. *first_col = col_index - 1; } if (right_margin >= part->RightAtY(y)) { // It completely spans this column. if (margin_columns == 0) *first_spanned_col = col_index; ++margin_columns; } *last_col = col_index; break; } else if (left < part->LeftAtY(y) && right > part->RightAtY(y)) { // Neither left nor right are contained within, so it spans this // column. if (*first_col < 0) { // It started in between the previous column and the current column. *first_col = col_index - 1; } if (margin_columns == 0) *first_spanned_col = col_index; *last_col = col_index; } else if (right < part->LeftAtY(y)) { // We have gone past the end. *last_col = col_index - 1; if (*first_col < 0) { // It must lie completely between columns =>noise. *first_col = col_index - 1; } break; } } if (*first_col < 0) *first_col = col_index - 1; // The last in-between. if (*last_col < 0) *last_col = col_index - 1; // The last in-between. ASSERT_HOST(*first_col >= 0 && *last_col >= 0); ASSERT_HOST(*first_col <= *last_col); if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) { // Neither end was in a column, and it didn't span any, so it lies // entirely between columns, therefore noise. return CST_NOISE; } else if (margin_columns <= 1) { // An exception for headings that stick outside of single-column text. if (margin_columns == 1 && parts_.singleton()) { return CST_HEADING; } // It is a pullout, as left and right were not in the same column, but // it doesn't go to the edge of its start and end. return CST_PULLOUT; } // Its margins went to the edges of first and last columns => heading. return CST_HEADING; }
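The column indexing convention in SpanningType() (odd index 2n+1 for real column n, even indices for the gaps, 0 to the left of the leftmost column) is easy to get wrong, so here is a small standalone illustration of how a caller could decode first_col/last_col; the helper names are made up for the example.

#include <cstdio>

// Not Tesseract code: decodes the 2n+1 / even-gap column index convention.
static bool IsRealColumn(int col_index) { return (col_index & 1) != 0; }
static int RealColumnNumber(int col_index) { return (col_index - 1) / 2; }

int main() {
  // With 3 physical columns the usable indices run 0..6:
  // 0 = left margin, 1 = column 0, 2 = gap, 3 = column 1,
  // 4 = gap, 5 = column 2, 6 = right margin.
  for (int idx = 0; idx <= 6; ++idx) {
    if (IsRealColumn(idx))
      printf("index %d -> column %d\n", idx, RealColumnNumber(idx));
    else
      printf("index %d -> gap\n", idx);
  }
  return 0;
}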
/********************************************************************** * recog_word * * Convert the word to tess form and pass it to the tess segmenter. * Convert the output back to editor form. **********************************************************************/ WERD_CHOICE *recog_word( //recog one word WERD *word, //word to do DENORM *denorm, //de-normaliser POLY_MATCHER matcher, //matcher function POLY_TESTER tester, //tester function POLY_TESTER trainer, //trainer function BOOL8 testing, //true if answer driven WERD_CHOICE *&raw_choice, //raw result //list of blob lists BLOB_CHOICE_LIST_CLIST *blob_choices, WERD *&outword //bln word output ) { WERD_CHOICE *word_choice; uinT8 perm_type; uinT8 real_dict_perm_type; if (word->blob_list ()->empty ()) { char empty_lengths[] = {0}; word_choice = new WERD_CHOICE ("", empty_lengths, 10.0f, -1.0f, TOP_CHOICE_PERM); raw_choice = new WERD_CHOICE ("", empty_lengths, 10.0f, -1.0f, TOP_CHOICE_PERM); outword = word->poly_copy (denorm->row ()->x_height ()); } else word_choice = recog_word_recursive (word, denorm, matcher, tester, trainer, testing, raw_choice, blob_choices, outword); if ((word_choice->lengths ().length () != outword->blob_list ()->length ()) || (word_choice->lengths ().length () != blob_choices->length ())) { tprintf ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word_choice->string ().string (), word_choice->lengths ().length (), outword->blob_list ()->length (), blob_choices->length ()); } ASSERT_HOST (word_choice->lengths ().length () == outword->blob_list ()->length ()); ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ()); /* Copy any reject blobs into the outword */ outword->rej_blob_list ()->deep_copy (word->rej_blob_list ()); if (tessedit_override_permuter) { /* Override the permuter type if a straight dictionary check disagrees. */ perm_type = word_choice->permuter (); if ((perm_type != SYSTEM_DAWG_PERM) && (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) { real_dict_perm_type = dict_word (word_choice->string ().string ()); if (((real_dict_perm_type == SYSTEM_DAWG_PERM) || (real_dict_perm_type == FREQ_DAWG_PERM) || (real_dict_perm_type == USER_DAWG_PERM)) && (alpha_count (word_choice->string ().string (), word_choice->lengths ().string ()) > 0)) word_choice->set_permuter (real_dict_perm_type); //Use dict perm } if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) { tprintf ("Permuter Type Flipped from %d to %d\n", perm_type, word_choice->permuter ()); } } assert ((word_choice == NULL) == (raw_choice == NULL)); return word_choice; }
void AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats) { stats->Clear(); ASSERT_HOST(word_res != NULL); if (word_res->blob_widths.empty()) { return; } if (debug) { tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", col, row, fixed_pitch ? " (fixed pitch)" : ""); } float normalizing_height = kBlnXHeight; ROW* blob_row = word_res->blob_row; // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? if (fixed_pitch && blob_row != NULL) { // For fixed pitch language like CJK, we use the full text height // as the normalizing factor so we are not dependent on xheight // calculation. if (blob_row->body_size() > 0.0f) { normalizing_height = word_res->denorm.y_scale() * blob_row->body_size(); } else { normalizing_height = word_res->denorm.y_scale() * (blob_row->x_height() + blob_row->ascenders()); } if (debug) { tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", normalizing_height, word_res->denorm.y_scale(), blob_row->x_height(), blob_row->ascenders()); } } float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height; if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true; // Compute the gap sum for this shape. If there are only negative or only // positive gaps, record their sum in stats->gap_sum. However, if there is // a mixture, record only the sum of the positive gaps. // TODO(antonova): explain fragment. int negative_gap_sum = 0; for (int c = col; c < row; ++c) { int gap = word_res->GetBlobsGap(c); (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap; } if (stats->gap_sum == 0) stats->gap_sum = negative_gap_sum; if (debug) { tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n", wh_ratio, max_char_wh_ratio, stats->gap_sum, stats->bad_shape ? "bad_shape" : ""); } // Compute shape_cost (for fixed pitch mode). if (fixed_pitch) { bool end_row = (row == (word_res->ratings->dimension() - 1)); // Ensure that the blob has gaps on the left and the right sides // (except for beginning and ending punctuation) and that there is // no cutting through ink at the blob boundaries. if (col > 0) { float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height; SEAM *left_seam = word_res->seam_array[col - 1]; if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) { stats->bad_shape = true; } if (debug) { tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority, stats->bad_shape ? "bad_shape" : ""); } } float right_gap = 0.0f; if (!end_row) { right_gap = word_res->GetBlobsGap(row) / normalizing_height; SEAM *right_seam = word_res->seam_array[row]; if (right_gap < kMinGap || right_seam->priority > 0.0f) { stats->bad_shape = true; if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true; } if (debug) { tprintf("right_gap %g right_seam %g %s\n", right_gap, right_seam->priority, stats->bad_shape ? "bad_shape" : ""); } } // Impose additional segmentation penalties if blob widths or gaps // distribution don't fit a fixed-pitch model. // Since we only know the widths and gaps of the path explored so far, // the means and variances are computed for the path so far (not // considering characters to the right of the last character on the path). 
stats->full_wh_ratio = wh_ratio + right_gap; if (parent_stats != NULL) { stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1); stats->full_wh_ratio_var = parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2); } else { stats->full_wh_ratio_total = stats->full_wh_ratio; } if (debug) { tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var); } stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); // For some reason Tesseract prefers to treat the whole CJ words // as one blob when the initial segmentation is particularly bad. // This hack is to avoid favoring such states. if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { stats->shape_cost += 10; } stats->shape_cost += stats->full_wh_ratio_var; if (debug) tprintf("shape_cost %g\n", stats->shape_cost); } }
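The full_wh_ratio_var accumulation above is a running sum of squared deviations from the path mean, recomputed as each character is appended. The standalone sketch below reproduces just that arithmetic with made-up ratio values:

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical full_wh_ratio values along a segmentation path.
  const float ratios[] = {1.0f, 1.1f, 0.9f, 2.4f};
  float total = 0.0f, var_penalty = 0.0f;
  for (int i = 0; i < 4; ++i) {
    total += ratios[i];
    float mean = total / static_cast<float>(i + 1);          // mean over the path so far
    var_penalty += std::pow(mean - ratios[i], 2.0f);          // squared deviation of the new step
    printf("step %d: mean=%.3f penalty=%.3f\n", i, mean, var_penalty);
  }
  return 0;
}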
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set, TFile *ambig_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset) { int i, j; UnicharIdVector *adaption_ambigs_entry; if (debug_level) tprintf("Reading ambiguities\n"); int test_ambig_part_size; int replacement_ambig_part_size; // The space for buffer is allocated on the heap to avoid // GCC frame size warning. const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; char *buffer = new char[kBufferSize]; char replacement_string[kMaxAmbigStringSize]; UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1]; int line_num = 0; int type = NOT_AMBIG; // Determine the version of the ambigs file. int version = 0; ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL && strlen(buffer) > 0); if (*buffer == 'v') { version = static_cast<int>(strtol(buffer+1, NULL, 10)); ++line_num; } else { ambig_file->Rewind(); } while (ambig_file->FGets(buffer, kBufferSize) != NULL) { chomp_string(buffer); if (debug_level > 2) tprintf("read line %s\n", buffer); ++line_num; if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set, buffer, &test_ambig_part_size, test_unichar_ids, &replacement_ambig_part_size, replacement_string, &type)) continue; // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. AmbigSpec *ambig_spec = new AmbigSpec(); if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_, test_ambig_part_size, test_unichar_ids, replacement_ambig_part_size, replacement_string, type, ambig_spec, unicharset)) continue; // Update one_to_one_definite_ambigs_. if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) { if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) { one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector(); } one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back( ambig_spec->correct_ngram_id); } // Update ambigs_for_adaption_. if (use_ambigs_for_adaption) { GenericVector<UNICHAR_ID> encoding; // Silently ignore invalid strings, as before, so it is safe to use a // universal ambigs file. if (unicharset->encode_string(replacement_string, true, &encoding, NULL, NULL)) { for (i = 0; i < test_ambig_part_size; ++i) { if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) { ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector(); } adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]]; for (int r = 0; r < encoding.size(); ++r) { UNICHAR_ID id_to_insert = encoding[r]; ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID); // Add the new unichar id to adaption_ambigs_entry (only if the // vector does not already contain it) keeping it in sorted order. for (j = 0; j < adaption_ambigs_entry->size() && (*adaption_ambigs_entry)[j] > id_to_insert; ++j); if (j < adaption_ambigs_entry->size()) { if ((*adaption_ambigs_entry)[j] != id_to_insert) { adaption_ambigs_entry->insert(id_to_insert, j); } } else { adaption_ambigs_entry->push_back(id_to_insert); } } } } } } delete[] buffer; // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector. 
if (use_ambigs_for_adaption) { for (i = 0; i < ambigs_for_adaption_.size(); ++i) { adaption_ambigs_entry = ambigs_for_adaption_[i]; if (adaption_ambigs_entry == NULL) continue; for (j = 0; j < adaption_ambigs_entry->size(); ++j) { UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j]; if (reverse_ambigs_for_adaption_[ambig_id] == NULL) { reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector(); } reverse_ambigs_for_adaption_[ambig_id]->push_back(i); } } } // Print what was read from the input file. if (debug_level > 1) { for (int tbl = 0; tbl < 2; ++tbl) { const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_; for (i = 0; i < print_table.size(); ++i) { AmbigSpec_LIST *lst = print_table[i]; if (lst == NULL) continue; if (!lst->empty()) { tprintf("%s Ambiguities for %s:\n", (tbl == 0) ? "Replaceable" : "Dangerous", unicharset->debug_str(i).string()); } AmbigSpec_IT lst_it(lst); for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { AmbigSpec *ambig_spec = lst_it.data(); tprintf("wrong_ngram:"); UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); tprintf("correct_fragments:"); UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); } } } if (use_ambigs_for_adaption) { for (int vec_id = 0; vec_id < 2; ++vec_id) { const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ? ambigs_for_adaption_ : reverse_ambigs_for_adaption_; for (i = 0; i < vec.size(); ++i) { adaption_ambigs_entry = vec[i]; if (adaption_ambigs_entry != NULL) { tprintf("%sAmbigs for adaption for %s:\n", (vec_id == 0) ? "" : "Reverse ", unicharset->debug_str(i).string()); for (j = 0; j < adaption_ambigs_entry->size(); ++j) { tprintf("%s ", unicharset->debug_str( (*adaption_ambigs_entry)[j]).string()); } tprintf("\n"); } } } } } }
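For reference, an ambiguities file consumed by this loader starts with an optional version line and then one ambiguity per line. The tiny example below is reconstructed from memory and should be checked against the unicharambigs documentation before use; in the v2 form each line gives the wrong n-gram, the replacement, and a type digit (1 for a mandatory/definite replacement, 0 otherwise).

v2
'' " 1
rn m 0
iii m 0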
// Extracts the needed information from the CHAR_DESC_STRUCT. void TrainingSample::ExtractCharDesc(int int_feature_type, int micro_type, int cn_type, int geo_type, CHAR_DESC_STRUCT* char_desc) { // Extract the INT features. if (features_ != NULL) delete [] features_; FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type]; if (char_features == NULL) { tprintf("Error: no features to train on of type %s\n", kIntFeatureType); num_features_ = 0; features_ = NULL; } else { num_features_ = char_features->NumFeatures; features_ = new INT_FEATURE_STRUCT[num_features_]; for (int f = 0; f < num_features_; ++f) { features_[f].X = static_cast<uinT8>(char_features->Features[f]->Params[IntX]); features_[f].Y = static_cast<uinT8>(char_features->Features[f]->Params[IntY]); features_[f].Theta = static_cast<uinT8>(char_features->Features[f]->Params[IntDir]); features_[f].CP_misses = 0; } } // Extract the Micro features. if (micro_features_ != NULL) delete [] micro_features_; char_features = char_desc->FeatureSets[micro_type]; if (char_features == NULL) { tprintf("Error: no features to train on of type %s\n", kMicroFeatureType); num_micro_features_ = 0; micro_features_ = NULL; } else { num_micro_features_ = char_features->NumFeatures; micro_features_ = new MicroFeature[num_micro_features_]; for (int f = 0; f < num_micro_features_; ++f) { for (int d = 0; d < MFCount; ++d) { micro_features_[f][d] = char_features->Features[f]->Params[d]; } } } // Extract the CN feature. char_features = char_desc->FeatureSets[cn_type]; if (char_features == NULL) { tprintf("Error: no CN feature to train on.\n"); } else { ASSERT_HOST(char_features->NumFeatures == 1); cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY]; cn_feature_[CharNormLength] = char_features->Features[0]->Params[CharNormLength]; cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx]; cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy]; } // Extract the Geo feature. char_features = char_desc->FeatureSets[geo_type]; if (char_features == NULL) { tprintf("Error: no Geo feature to train on.\n"); } else { ASSERT_HOST(char_features->NumFeatures == 1); geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom]; geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop]; geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth]; } features_are_indexed_ = false; features_are_mapped_ = false; }
// Classifies the given [training] sample, writing to results. // See shapeclassifier.h for a full description. // Default implementation aborts. int ShapeClassifier::ClassifySample(const TrainingSample& sample, Pix* page_pix, int debug, int keep_this, GenericVector<ShapeRating>* results) { ASSERT_HOST("Must implement ClassifySample!" == NULL); return 0; }
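The ASSERT_HOST with an always-false string comparison is a deliberate "abort if not overridden" default. A generic sketch of the same pattern, not tied to the real ShapeClassifier hierarchy:

#include <cassert>
#include <vector>

// The base class supplies a body only so subclasses may override a subset of
// the virtuals; calling a missing override trips the assert at run time
// (in release builds with NDEBUG the assert is compiled out).
struct Classifier {
  virtual ~Classifier() = default;
  virtual int Classify(const std::vector<float>& features,
                       std::vector<int>* results) {
    assert(!"Must implement Classify!");
    return 0;
  }
};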
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset, UNICHARSET *unicharset) { int i; for (i = 0; i < unicharset->size(); ++i) { replace_ambigs_.push_back(NULL); dang_ambigs_.push_back(NULL); one_to_one_definite_ambigs_.push_back(NULL); } if (global_ambigs_debug_level) tprintf("Reading ambiguities\n"); int TestAmbigPartSize; int ReplacementAmbigPartSize; // Maximum line size: // 10 for sizes of ambigs, tabs, abmig type and newline // UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig // The space for buffer is allocated on the heap to avoid // GCC frame size warning. const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1); const int kBufferSize = 10 + 2 * kMaxAmbigStringSize; char *buffer = new char[kBufferSize]; char ReplacementString[kMaxAmbigStringSize]; UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1]; int line_num = 0; int type = NOT_AMBIG; // Determine the version of the ambigs file. int version = 0; ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL && strlen(buffer) > 0); if (*buffer == 'v') { version = static_cast<int>(strtol(buffer+1, NULL, 10)); ++line_num; } else { rewind(AmbigFile); } while ((end_offset < 0 || ftell(AmbigFile) < end_offset) && fgets(buffer, kBufferSize, AmbigFile) != NULL) { chomp_string(buffer); if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer); ++line_num; if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer, &TestAmbigPartSize, TestUnicharIds, &ReplacementAmbigPartSize, ReplacementString, &type)) continue; // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST. AmbigSpec *ambig_spec = new AmbigSpec(); InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_, TestAmbigPartSize, TestUnicharIds, ReplacementAmbigPartSize, ReplacementString, type, ambig_spec, unicharset); // Update one_to_one_definite_ambigs_. if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) { if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) { one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector(); } one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back( ambig_spec->correct_ngram_id); } } delete[] buffer; // Print what was read from the input file. if (global_ambigs_debug_level > 2) { for (int tbl = 0; tbl < 2; ++tbl) { const UnicharAmbigsVector &print_table = (tbl == 0) ? replace_ambigs_ : dang_ambigs_; for (i = 0; i < print_table.size(); ++i) { AmbigSpec_LIST *lst = print_table[i]; if (lst == NULL) continue; if (!lst->empty()) { tprintf("%s Ambiguities for %s:\n", (tbl == 0) ? "Replaceable" : "Dangerous", unicharset->debug_str(i).string()); } AmbigSpec_IT lst_it(lst); for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) { AmbigSpec *ambig_spec = lst_it.data(); tprintf("wrong_ngram:"); UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset); tprintf("correct_fragments:"); UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset); } } } } }
// Adds the dw_ in other to the dw_ in *this. void WeightMatrix::AddDeltas(const WeightMatrix& other) { ASSERT_HOST(dw_.dim1() == other.dw_.dim1()); ASSERT_HOST(dw_.dim2() == other.dw_.dim2()); dw_ += other.dw_; }
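AddDeltas() supports accumulating weight updates computed by separate copies of a network (for example, per-thread training replicas) into one matrix before a single update step is applied. The sketch below shows only that accumulation pattern with a plain vector; it is not the real WeightMatrix API, and the names and dimensions are illustrative.

#include <cstddef>
#include <vector>

// Each worker accumulates its own deltas; a master sums them before updating.
struct Deltas {
  std::vector<double> dw;
  void Add(const Deltas& other) {
    for (std::size_t i = 0; i < dw.size(); ++i) dw[i] += other.dw[i];
  }
};

int main() {
  Deltas master{std::vector<double>(4, 0.0)};
  Deltas worker{std::vector<double>{0.1, -0.2, 0.0, 0.3}};
  master.Add(worker);   // analogous to master_weights.AddDeltas(worker_weights)
  return 0;
}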