Beispiel #1
0
// Returns a random number in [-range, range], obtained from randomizer_.
// randomizer_ must already be set: a NULL randomizer_ trips ASSERT_HOST.
double Network::Random(double range) {
  ASSERT_HOST(randomizer_ != NULL);
  const double signed_sample = randomizer_->SignedRand(range);
  return signed_sample;
}
Beispiel #2
0
void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
                        UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
                        MATRIX *ratings) {
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE* old_choice = nullptr;
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST* choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index,
                     begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr)
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating())
      choice->set_rating(new_rating);
    if (new_certainty < choice->certainty())
      choice->set_certainty(new_certainty);
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it (new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
       ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index,
                                   num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
      werd_choice->print("ReplaceAmbig() ");
      tprintf("Modified blob_choices: ");
      print_ratings_list("\n", new_choices, getUnicharset());
  }
}
// Picks the blob of word_res with the lowest noise score among those that are
// far enough from both ends of the word.
//
// Every blob gets a score: accepted blobs (per reject_map) are given
// non_noise_limit, all others the value of blob_noise_score(). Scanning in
// from each end, fixsp_non_noise_limit blobs scoring at least non_noise_limit
// must be passed before a blob becomes a candidate.
//
// Returns the index of the candidate with the lowest score below small_limit
// and stores that score in *worst_noise_score. Returns -1 if the word has
// fewer than 5 blobs, if either end lacks enough non-noise blobs, if the
// candidate range is empty, or if no candidate scores below small_limit.
// *worst_noise_score is written only once the candidate range is known.
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  const float small_limit = bln_x_height * fixsp_small_outlines_size;
  const float non_noise_limit = bln_x_height * 0.8;
  float noise_score[512];

  PBLOB_IT blob_it;
  blob_it.set_to_list (word_res->outword->blob_list ());
  const inT16 blob_count = blob_it.length ();  // normalised blobs
  ASSERT_HOST (blob_count <= 512);
  if (blob_count < 5) {
    return -1;  // too short to split
  }

  #ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf ("FP fixspace Noise metrics for \"%s\": ",
      word_res->best_choice->string ().string ());
  #endif

  // Score every blob, walking the blob list in step with the index.
  for (int idx = 0; idx < blob_count; ++idx, blob_it.forward ()) {
    if (word_res->reject_map[idx].accepted ()) {
      noise_score[idx] = non_noise_limit;
    } else {
      noise_score[idx] = blob_noise_score (blob_it.data ());
    }
    if (debug_fix_space_level > 5) {
      tprintf ("%1.1f ", noise_score[idx]);
    }
  }
  if (debug_fix_space_level > 5) {
    tprintf ("\n");
  }

  // Skip in from the front until enough non-noise blobs have been passed.
  int solid_seen = 0;
  int scan = 0;
  while ((scan < blob_count) && (solid_seen < fixsp_non_noise_limit)) {
    if (noise_score[scan] >= non_noise_limit) {
      ++solid_seen;
    }
    ++scan;
  }
  if (solid_seen < fixsp_non_noise_limit) {
    return -1;
  }
  const int first_candidate = scan;

  // Same again from the back.
  solid_seen = 0;
  scan = blob_count - 1;
  while ((scan >= 0) && (solid_seen < fixsp_non_noise_limit)) {
    if (noise_score[scan] >= non_noise_limit) {
      ++solid_seen;
    }
    --scan;
  }
  if (solid_seen < fixsp_non_noise_limit) {
    return -1;
  }
  const int last_candidate = scan;

  if (first_candidate > last_candidate) {
    return -1;
  }

  // Lowest score strictly below small_limit wins; ties keep the earliest.
  int worst_index = -1;
  *worst_noise_score = small_limit;
  for (int idx = first_candidate; idx <= last_candidate; ++idx) {
    if (noise_score[idx] < *worst_noise_score) {
      *worst_noise_score = noise_score[idx];
      worst_index = idx;
    }
  }
  return worst_index;
}
// Top-level method to perform splitting based on current settings.
// Returns true if a split was actually performed.
// split_for_pageseg should be true if the splitting is being done prior to
// page segmentation. This mode uses the flag
// pageseg_devanagari_split_strategy to determine the splitting strategy.
bool ShiroRekhaSplitter::Split(bool split_for_pageseg) {
  SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ :
      ocr_split_strategy_;
  if (split_strategy == NO_SPLIT) {
    return false;  // Nothing to do.
  }
  ASSERT_HOST(split_strategy == MINIMAL_SPLIT ||
              split_strategy == MAXIMAL_SPLIT);
  ASSERT_HOST(orig_pix_);
  if (devanagari_split_debuglevel > 0) {
    tprintf("Splitting shiro-rekha ...\n");
    tprintf("Split strategy = %s\n",
            split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
    tprintf("Initial pageseg available = %s\n",
            segmentation_block_list_ ? "yes" : "no");
  }
  // Create a copy of original image to store the splitting output.
  pixDestroy(&splitted_image_);
  splitted_image_ = pixCopy(NULL, orig_pix_);

  // Initialize debug image if required.
  if (devanagari_split_debugimage) {
    pixDestroy(&debug_image_);
    debug_image_ = pixConvertTo32(orig_pix_);
  }

  // Determine all connected components in the input image. A close operation
  // may be required prior to this, depending on the current settings.
  Pix* pix_for_ccs = pixClone(orig_pix_);
  if (perform_close_ && global_xheight_ != kUnspecifiedXheight &&
      !segmentation_block_list_) {
    if (devanagari_split_debuglevel > 0) {
      tprintf("Performing a global close operation..\n");
    }
    // A global measure is available for xheight, but no local information
    // exists.
    pixDestroy(&pix_for_ccs);
    pix_for_ccs = pixCopy(NULL, orig_pix_);
    PerformClose(pix_for_ccs, global_xheight_);
  }
  Pixa* ccs;
  Boxa* tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
  boxaDestroy(&tmp_boxa);
  pixDestroy(&pix_for_ccs);

  // Iterate over all connected components. Get their bounding boxes and clip
  // out the image regions corresponding to these boxes from the original image.
  // Conditionally run splitting on each of them.
  Boxa* regions_to_clear = boxaCreate(0);
  for (int i = 0; i < pixaGetCount(ccs); ++i) {
    Box* box = ccs->boxa->box[i];
    Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL);
    ASSERT_HOST(word_pix);
    int xheight = GetXheightForCC(box);
    if (xheight == kUnspecifiedXheight && segmentation_block_list_ &&
        devanagari_split_debugimage) {
      pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
    }
    // If some xheight measure is available, attempt to pre-eliminate small
    // blobs from the shiro-rekha process. This is primarily to save the CCs
    // corresponding to punctuation marks/small dots etc which are part of
    // larger graphemes.
    if (xheight == kUnspecifiedXheight ||
        (box->w > xheight / 3 && box->h > xheight / 2)) {
      SplitWordShiroRekha(split_strategy, word_pix, xheight,
                          box->x, box->y, regions_to_clear);
    } else if (devanagari_split_debuglevel > 0) {
      tprintf("CC dropped from splitting: %d,%d (%d, %d)\n",
              box->x, box->y, box->w, box->h);
    }
    pixDestroy(&word_pix);
  }
  // Actually clear the boxes now.
  for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
    Box* box = boxaGetBox(regions_to_clear, i, L_CLONE);
    pixClearInRect(splitted_image_, box);
    boxDestroy(&box);
  }
  boxaDestroy(&regions_to_clear);
  pixaDestroy(&ccs);
  if (devanagari_split_debugimage) {
    DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" :
                   "ocr_split_debug.png");
  }
  return true;
}
// Attempt to improve this by adding partitions or expanding partitions.
// Every non-NULL set in src_sets is compared against the partitions of this:
//  - a source partition that overlaps nothing in this is added to this as a
//    shallow copy;
//  - a source partition that overlaps an existing part may donate its left
//    and/or right edge (tab edge first, box edge as fallback) when the width
//    callback cb accepts the resulting key width, or when the existing part's
//    width was not accepted by cb to begin with.
// cb is run on candidate key widths; this must not be empty (asserted).
// ComputeCoverage() is called once after all sets have been processed.
void ColPartitionSet::ImproveColumnCandidate(WidthCallback* cb,
                                             PartSetVector* src_sets) {
  int set_size = src_sets->size();
  // Iterate over the provided column sets, as each one may have something
  // to improve this.
  for (int i = 0; i < set_size; ++i) {
    ColPartitionSet* column_set = src_sets->get(i);
    if (column_set == NULL)
      continue;
    // Iterate over the parts in this and column_set, adding bigger or
    // new parts in column_set to this.
    ColPartition_IT part_it(&parts_);
    ASSERT_HOST(!part_it.empty());
    // Right key of the part most recently stepped past in this; MIN_INT32
    // until the first step, so any col_left passes the "> prev_right" test.
    int prev_right = MIN_INT32;
    part_it.mark_cycle_pt();
    ColPartition_IT col_it(&column_set->parts_);
    for (col_it.mark_cycle_pt(); !col_it.cycled_list(); col_it.forward()) {
      ColPartition* col_part = col_it.data();
      if (col_part->blob_type() < BRT_UNKNOWN)
        continue;  // Ignore image partitions.
      int col_left = col_part->left_key();
      int col_right = col_part->right_key();
      // Sync-up part_it (in this) so it matches the col_part in column_set.
      // Stops at the first part whose right key reaches col_left, or at the
      // last part of this if there is no such part.
      ColPartition* part = part_it.data();
      while (!part_it.at_last() && part->right_key() < col_left) {
        prev_right = part->right_key();
        part_it.forward();
        part = part_it.data();
      }
      int part_left = part->left_key();
      int part_right = part->right_key();
      if (part_right < col_left || col_right < part_left) {
        // There is no overlap so this is a new partition.
        // NOTE(review): where AddPartition leaves part_it is not visible
        // here; the next iteration re-reads part_it.data() regardless.
        AddPartition(col_part->ShallowCopy(), &part_it);
        continue;
      }
      // Check the edges of col_part to see if they can improve part.
      // part_width_ok records whether the part's current width is acceptable
      // before any edge is moved; both edge tests below reuse this value.
      bool part_width_ok = cb->Run(part->KeyWidth(part_left, part_right));
      if (col_left < part_left && col_left > prev_right) {
        // The left edge of the column is better and it doesn't overlap,
        // so we can potentially expand it.
        int col_box_left = col_part->BoxLeftKey();
        bool tab_width_ok = cb->Run(part->KeyWidth(col_left, part_right));
        bool box_width_ok = cb->Run(part->KeyWidth(col_box_left, part_right));
        if (tab_width_ok || (!part_width_ok )) {
          // The tab is leaving the good column metric at least as good as
          // it was before, so use the tab.
          part->CopyLeftTab(*col_part, false);
          part->SetColumnGoodness(cb);
        } else if (col_box_left < part_left &&
                   (box_width_ok || !part_width_ok)) {
          // The box is leaving the good column metric at least as good as
          // it was before, so use the box.
          part->CopyLeftTab(*col_part, true);
          part->SetColumnGoodness(cb);
        }
        // Re-read the left key so the right-edge width checks below use the
        // (possibly moved) left edge.
        part_left = part->left_key();
      }
      // Only extend to the right if that cannot run into the next part of
      // this: either there is no next part or it starts beyond col_right.
      if (col_right > part_right &&
          (part_it.at_last() ||
           part_it.data_relative(1)->left_key() > col_right)) {
        // The right edge is better, so we can possibly expand it.
        int col_box_right = col_part->BoxRightKey();
        bool tab_width_ok = cb->Run(part->KeyWidth(part_left, col_right));
        bool box_width_ok = cb->Run(part->KeyWidth(part_left, col_box_right));
        if (tab_width_ok || (!part_width_ok )) {
          // The tab is leaving the good column metric at least as good as
          // it was before, so use the tab.
          part->CopyRightTab(*col_part, false);
          part->SetColumnGoodness(cb);
        } else if (col_box_right > part_right &&
                   (box_width_ok || !part_width_ok)) {
          // The box is leaving the good column metric at least as good as
          // it was before, so use the box.
          part->CopyRightTab(*col_part, true);
          part->SetColumnGoodness(cb);
        }
      }
    }
  }
  ComputeCoverage();
}
Beispiel #6
0
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 *
 * Returns NULL when the single block's line_size is below 2 (no ColumnFinder
 * is created), and also when OSD was run and pageseg_mode is PSM_OSD_ONLY
 * (the ColumnFinder is deleted before returning).
 * Requires pix_binary_ to be non-NULL and to_blocks to end up holding exactly
 * one block; both are asserted.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
    OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
    Pix** music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  // NOTE(review): bleft is not referenced again in this function.
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "PageSegInput");
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoLines");
  }
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
  if (tessedit_dump_pageseg_images) {
    pixa_debug_.AddPix(pix_binary_, "NoImages");
  }
  // Vertical lines are discarded when column finding is disabled for this
  // page segmentation mode.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;
  int estimated_resolution = source_resolution_;
  // An estimate is only attempted when the source resolution equals
  // kMinCredibleResolution, and only adopted when it is larger than that
  // value and below kMaxCredibleResolution.
  if (source_resolution_ == kMinCredibleResolution) {
    // Try to estimate resolution from typical body text size.
    int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
    if (res > estimated_resolution && res < kMaxCredibleResolution) {
      estimated_resolution = res;
      tprintf("Estimating resolution as %d\n", estimated_resolution);
    }
  }

  // With a line size below 2 no ColumnFinder is built and NULL is returned.
  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              estimated_resolution, textord_use_cjk_fp_model,
                              textord_tabfind_aligned_gap_fraction, &v_lines,
                              &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    // Vertical text is either forced (parameter or page segmentation mode) or,
    // when enabled and orientation detection is on, decided by the finder.
    bool vertical_text = textord_tabfind_force_vertical_text ||
                         pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
    if (!vertical_text && textord_tabfind_vertical_text &&
        PSM_ORIENTATION_ENABLED(pageseg_mode)) {
      vertical_text =
          finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
                                          to_block, &osd_blobs);
    }
    if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
      GenericVector<int> osd_scripts;
      if (osd_tess != this) {
        // We are running osd as part of layout analysis, so constrain the
        // scripts to those allowed by *this.
        AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
        for (int s = 0; s < sub_langs_.size(); ++s) {
          AddAllScriptsConverted(sub_langs_[s]->unicharset,
                                 osd_tess->unicharset, &osd_scripts);
        }
      }
      os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
      if (pageseg_mode == PSM_OSD_ONLY) {
        // Only the OSD results in *osr were wanted; no layout analysis follows.
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      // osd_margin ends up as the smallest gap between the best orientation's
      // score and any of the other three, capped at 2 * min_orientation_margin.
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      int best_script_id = osr->best_result.script_id;
      const char* best_script_str =
          osd_tess->unicharset.get_script_from_script_id(best_script_id);
      // The script counts as CJK if its id is han/hiragana/katakana or its
      // name is one of "Japanese", "Korean", "Hangul".
      bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
          best_script_id == osd_tess->unicharset.hiragana_sid() ||
          best_script_id == osd_tess->unicharset.katakana_sid() ||
          strcmp("Japanese", best_script_str) == 0 ||
          strcmp("Korean", best_script_str) == 0 ||
          strcmp("Hangul", best_script_str) == 0;
      if (cjk) {
        finder->set_cjk_script(true);
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          tprintf(
              "OSD: Weak margin (%.2f) for %d blob text block, "
              "but using orientation anyway: %d\n",
              osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
Beispiel #7
0
/**
 * This routine reads a textual description of a prototype from
 * the specified file.
 *
 * The expected token order, as consumed below, is: a significance token
 * (first character 's' means significant), the prototype style, a
 * non-negative sample count, N mean values, and then style-specific data:
 * one variance (spherical), N variances (elliptical), or N distribution
 * tokens followed by N variances (mixed).
 *
 * Exceptions:
 * - ILLEGALSIGNIFICANCESPEC
 * - ILLEGALSAMPLECOUNT
 * - ILLEGALMEANSPEC
 * - ILLEGALVARIANCESPEC
 * - ILLEGALDISTRIBUTION
 * @param File open text file to read prototype from
 * @param N number of dimensions used in prototype
 * @return Newly allocated prototype, or NULL if the first read hits EOF
 *         (NULL is also returned after reporting ILLEGALSIGNIFICANCESPEC
 *         if DoError returns)
 * @note Globals: None
 * @note History: 6/6/89, DSJ, Created.
 */
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
  char Token[TOKENSIZE];
  int Status;
  PROTOTYPE *Proto;
  int SampleCount;
  int i;

  // NOTE(review): "%s" has no field width, so a token longer than the Token
  // buffer would overrun it - confirm inputs are trusted or add a width.
  if ((Status = tfscanf(File, "%s", Token)) == 1) {
    Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
    Proto->Cluster = NULL;
    // Only the first character of the significance token is examined.
    if (Token[0] == 's')
      Proto->Significant = TRUE;
    else
      Proto->Significant = FALSE;

    Proto->Style = ReadProtoStyle (File);

    if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0))
      DoError (ILLEGALSAMPLECOUNT, "Illegal sample count");
    Proto->NumSamples = SampleCount;

    Proto->Mean = ReadNFloats (File, N, NULL);
    if (Proto->Mean == NULL)
      DoError (ILLEGALMEANSPEC, "Illegal prototype mean");

    // The remaining fields depend on the style. In every case the per
    // dimension magnitudes are multiplied into TotalMagnitude and its log is
    // stored in LogMagnitude.
    switch (Proto->Style) {
      case spherical:
        // One variance shared by all N dimensions.
        if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL)
          DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Spherical =
          1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
        Proto->TotalMagnitude =
          pow (Proto->Magnitude.Spherical, (float) N);
        Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
        Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
        Proto->Distrib = NULL;
        break;
      case elliptical:
        // One variance per dimension; magnitude and weight arrays are
        // allocated here and filled from the variances.
        Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
        if (Proto->Variance.Elliptical == NULL)
          DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Elliptical =
          (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
        Proto->Weight.Elliptical =
          (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
        Proto->TotalMagnitude = 1.0;
        for (i = 0; i < N; i++) {
          Proto->Magnitude.Elliptical[i] =
            1.0 /
            sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
          Proto->Weight.Elliptical[i] =
            1.0 / Proto->Variance.Elliptical[i];
          Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
        }
        Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
        Proto->Distrib = NULL;
        break;
      case mixed:
        // One distribution token per dimension, identified by its first
        // character: 'n' normal, 'u' uniform, 'r' random.
        Proto->Distrib =
          (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
        for (i = 0; i < N; i++) {
          // NOTE(review): same unbounded "%s" read into Token as above.
          if (tfscanf(File, "%s", Token) != 1)
            DoError (ILLEGALDISTRIBUTION,
              "Illegal prototype distribution");
          switch (Token[0]) {
            case 'n':
              Proto->Distrib[i] = normal;
              break;
            case 'u':
              Proto->Distrib[i] = uniform;
              break;
            case 'r':
              Proto->Distrib[i] = D_random;
              break;
            default:
              DoError (ILLEGALDISTRIBUTION,
                "Illegal prototype distribution");
          }
        }
        Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
        if (Proto->Variance.Elliptical == NULL)
          DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
        Proto->Magnitude.Elliptical =
          (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
        Proto->Weight.Elliptical =
          (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
        Proto->TotalMagnitude = 1.0;
        for (i = 0; i < N; i++) {
          switch (Proto->Distrib[i]) {
            case normal:
              Proto->Magnitude.Elliptical[i] = 1.0 /
                sqrt ((double)
                (2.0 * PI * Proto->Variance.Elliptical[i]));
              Proto->Weight.Elliptical[i] =
                1.0 / Proto->Variance.Elliptical[i];
              break;
            case uniform:
            case D_random:
              // NOTE(review): Weight.Elliptical[i] is not assigned for these
              // two distributions and keeps whatever Emalloc returned.
              Proto->Magnitude.Elliptical[i] = 1.0 /
                (2.0 * Proto->Variance.Elliptical[i]);
              break;
            case DISTRIBUTION_COUNT:
              ASSERT_HOST(!"Distribution count not allowed!");
          }
          Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
        }
        Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
        break;
    }
    return (Proto);
  }
  else if (Status == EOF)
    // Clean end of input: no more prototypes to read.
    return (NULL);
  else {
    DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
    return (NULL);
  }
}
Beispiel #8
0
/**
 * Sets up auto page segmentation, determines the orientation, and corrects it.
 * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
 * facilitate testing.
 * photo_mask_pix is a pointer to a NULL pointer that will be filled on return
 * with the leptonica photo mask, which must be pixDestroyed by the caller.
 * to_blocks is an empty list that will be filled with (usually a single)
 * block that is used during layout analysis. This ugly API is required
 * because of the possibility of a unlv zone file.
 * TODO(rays) clean this up.
 * See AutoPageSeg for other arguments.
 * The returned ColumnFinder must be deleted after use.
 *
 * single_column discards the detected vertical lines; osd enables orientation
 * and script detection (which also needs non-NULL osd_tess and osr); only_osd
 * makes the function return right after detection.
 * Returns NULL when the single block's line_size is below 2 (no ColumnFinder
 * is created), and also when OSD was run with only_osd set (the ColumnFinder
 * is deleted before returning).
 * Requires pix_binary_ to be non-NULL and to_blocks to end up holding exactly
 * one block; both are asserted.
 */
ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
    bool single_column, bool osd, bool only_osd,
    BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr,
    TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);

  ASSERT_HOST(pix_binary_ != NULL);
  if (tessedit_dump_pageseg_images) {
    pixWrite("tessinput.png", pix_binary_, IFF_PNG);
  }
  // Leptonica is used to find the rule/separator lines in the input.
  LineFinder::FindAndRemoveLines(source_resolution_,
                                 textord_tabfind_show_vlines, pix_binary_,
                                 &vertical_x, &vertical_y, music_mask_pix,
                                 &v_lines, &h_lines);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
  // Leptonica is used to find a mask of the photo regions in the input.
  *photo_mask_pix = ImageFind::FindImages(pix_binary_);
  if (tessedit_dump_pageseg_images)
    pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
  if (single_column)
    v_lines.clear();

  // The rest of the algorithm uses the usual connected components.
  textord_.find_components(pix_binary_, blocks, to_blocks);

  TO_BLOCK_IT to_block_it(to_blocks);
  // There must be exactly one input block.
  // TODO(rays) handle new textline finding with a UNLV zone file.
  ASSERT_HOST(to_blocks->singleton());
  TO_BLOCK* to_block = to_block_it.data();
  TBOX blkbox = to_block->block->bounding_box();
  ColumnFinder* finder = NULL;

  // With a line size below 2 no ColumnFinder is built and NULL is returned.
  if (to_block->line_size >= 2) {
    finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                              blkbox.botleft(), blkbox.topright(),
                              source_resolution_,
                              &v_lines, &h_lines, vertical_x, vertical_y);

    finder->SetupAndFilterNoise(*photo_mask_pix, to_block);

    if (equ_detect_) {
      equ_detect_->LabelSpecialText(to_block);
    }

    BLOBNBOX_CLIST osd_blobs;
    // osd_orientation is the number of 90 degree rotations to make the
    // characters upright. (See osdetect.h for precise definition.)
    // We want the text lines horizontal, (vertical text indicates vertical
    // textlines) which may conflict (eg vertically written CJK).
    int osd_orientation = 0;
    bool vertical_text = finder->IsVerticallyAlignedText(to_block, &osd_blobs);
    if (osd && osd_tess != NULL && osr != NULL) {
      os_detect_blobs(&osd_blobs, osr, osd_tess);
      if (only_osd) {
        // Only the OSD results in *osr were wanted; no layout analysis follows.
        delete finder;
        return NULL;
      }
      osd_orientation = osr->best_result.orientation_id;
      double osd_score = osr->orientations[osd_orientation];
      // osd_margin ends up as the smallest gap between the best orientation's
      // score and any of the other three, capped at 2 * min_orientation_margin.
      double osd_margin = min_orientation_margin * 2;
      for (int i = 0; i < 4; ++i) {
        if (i != osd_orientation &&
            osd_score - osr->orientations[i] < osd_margin) {
          osd_margin = osd_score - osr->orientations[i];
        }
      }
      if (osd_margin < min_orientation_margin) {
        // The margin is weak.
        int best_script_id = osr->best_result.script_id;
        bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
            (best_script_id == osd_tess->unicharset.hiragana_sid()) ||
            (best_script_id == osd_tess->unicharset.katakana_sid());

        if (!cjk && !vertical_text && osd_orientation == 2) {
          // upside down latin text is improbable with such a weak margin.
          tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                  "Don't rotate.\n", osd_margin);
          osd_orientation = 0;
        } else {
          // Arguments must follow the format string: the margin (double) for
          // %.2f, then the blob count and the orientation for the two %d.
          tprintf("OSD: Weak margin (%.2f) for %d blob text block, "
                  "but using orientation anyway: %d\n",
                  osd_margin, osd_blobs.length(), osd_orientation);
        }
      }
    }
    osd_blobs.shallow_clear();
    finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
  }

  return finder;
}
// Returns the mean confidence of the current object at the given level.
// The number should be interpreted as a percent probability. (0.0f-100.0f)
// For block, paragraph and text line levels the certainties of the words from
// the current position to the end of that unit are averaged; a word uses its
// best choice's certainty; a symbol uses the certainty of the blob choice
// matching the best choice at blob_index_ (or the word's certainty when no
// blob choices are available). The mean certainty c is mapped to 100 + 5 * c
// and clipped to [0, 100]. Returns 0 when the iterator is already at the end
// or the level is not one of the handled ones.
float LTRResultIterator::Confidence(PageIteratorLevel level) const {
  if (it_->word() == NULL) return 0.0f;  // Already at the end!
  float certainty_sum = 0.0f;
  int samples = 0;
  PAGE_RES_IT res_it(*it_);
  WERD_CHOICE* best_choice = res_it.word()->best_choice;
  ASSERT_HOST(best_choice != NULL);
  switch (level) {
    case RIL_BLOCK:
      // Accumulate words until the iterator steps into a different block.
      for (;;) {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        certainty_sum += best_choice->certainty();
        ++samples;
        res_it.forward();
        if (res_it.block() != res_it.prev_block()) break;
      }
      break;
    case RIL_PARA:
      // Accumulate words until the block or the paragraph changes.
      for (;;) {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        certainty_sum += best_choice->certainty();
        ++samples;
        res_it.forward();
        if (res_it.block() != res_it.prev_block() ||
            res_it.row()->row->para() != res_it.prev_row()->row->para()) {
          break;
        }
      }
      break;
    case RIL_TEXTLINE:
      // Accumulate words until the row changes.
      for (;;) {
        best_choice = res_it.word()->best_choice;
        ASSERT_HOST(best_choice != NULL);
        certainty_sum += best_choice->certainty();
        ++samples;
        res_it.forward();
        if (res_it.row() != res_it.prev_row()) break;
      }
      break;
    case RIL_WORD:
      certainty_sum += best_choice->certainty();
      ++samples;
      break;
    case RIL_SYMBOL: {
      BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
      if (choices == NULL) {
        certainty_sum += best_choice->certainty();
      } else {
        // Step to the choice list of the current blob.
        BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
        for (int remaining = blob_index_; remaining > 0; --remaining)
          blob_choices_it.forward();
        // Find the choice with the best choice's unichar id; if none matches
        // the iterator is left wherever the full cycle ends.
        BLOB_CHOICE_IT choice_it(blob_choices_it.data());
        choice_it.mark_cycle_pt();
        while (!choice_it.cycled_list() &&
               choice_it.data()->unichar_id() !=
                   best_choice->unichar_id(blob_index_)) {
          choice_it.forward();
        }
        certainty_sum += choice_it.data()->certainty();
      }
      ++samples;
    }
  }
  if (samples <= 0) return 0.0f;

  const float mean_certainty = certainty_sum / samples;
  float confidence = 100 + 5 * mean_certainty;
  if (confidence < 0.0f) {
    confidence = 0.0f;
  } else if (confidence > 100.0f) {
    confidence = 100.0f;
  }
  return confidence;
}
Beispiel #10
0
/**
 * Computes the mean unigram cost of the words in a UTF-32 string.
 *
 * The input is converted to UTF-8 and split on spaces and tabs. Each token
 * is converted back to UTF-32, its trailing punctuation (as reported by
 * lang_mod->IsTrailingPunc) is stripped, and the result is scored with
 * CostInternal(). Tokens that are case invariant may instead take the
 * cheaper of their all-lower-case / all-upper-case costs, and tokens made
 * up only of digits cost 0. Both of these refinements apply only when the
 * stripped length is at least kMinLengthNumOrCaseInvariant.
 *
 * @param key_str32 NUL-terminated UTF-32 string to score; may be NULL.
 * @param lang_mod  language model used for punctuation and digit tests.
 * @param char_set  character set used for the case conversions.
 * @return the mean per-word cost truncated to int, or 0 if key_str32 is
 *         NULL or contains no tokens.
 */
int WordUnigrams::Cost(const char_32 *key_str32,
                       LangModel *lang_mod,
                       CharSet *char_set) const {
  if (!key_str32)
    return 0;
  // convert string to UTF8 to split into space-separated words
  string key_str;
  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
  vector<string> words;
  CubeUtils::SplitStringUsing(key_str, " \t", &words);

  // no words => no cost
  if (words.empty()) {
    return 0;
  }

  // aggregate the costs of all the words
  int cost = 0;
  for (size_t word_idx = 0; word_idx < words.size(); ++word_idx) {
    // convert each word back to UTF32 for analyzing case and punctuation
    string_32 str32;
    CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
    int len = CubeUtils::StrLen(str32.c_str());

    // strip all trailing punctuation
    int clean_len = len;
    bool trunc = false;
    while (clean_len > 0 &&
           lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
      --clean_len;
      trunc = true;
    }

    // If either the original string was not truncated (no trailing
    // punctuation) or the entire string was removed (all characters
    // are trailing punctuation), evaluate original word as is;
    // otherwise, copy all but the trailing punctuation characters.
    // Note that in the "entire string removed" case clean_len stays 0,
    // so the length-gated checks below still see a length of 0.
    char_32 *clean_str32 = NULL;
    if (clean_len == 0 || !trunc) {
      clean_str32 = CubeUtils::StrDup(str32.c_str());
    } else {
      clean_str32 = new char_32[clean_len + 1];
      for (int i = 0; i < clean_len; ++i) {
        clean_str32[i] = str32[i];
      }
      clean_str32[clean_len] = '\0';
    }
    ASSERT_HOST(clean_str32 != NULL);

    string str8;
    CubeUtils::UTF32ToUTF8(clean_str32, &str8);
    int word_cost = CostInternal(str8.c_str());

    // if case invariant, get costs of all-upper-case and all-lower-case
    // versions and return the min cost
    if (clean_len >= kMinLengthNumOrCaseInvariant &&
        CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
      char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
      if (lower_32) {
        string lower_8;
        CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
        word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
        delete [] lower_32;
      }
      char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
      if (upper_32) {
        string upper_8;
        CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
        word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
        delete [] upper_32;
      }
    }

    if (clean_len >= kMinLengthNumOrCaseInvariant) {
      // if characters are all numeric, incur 0 word cost
      bool is_numeric = true;
      for (int i = 0; i < clean_len; ++i) {
        if (!lang_mod->IsDigit(clean_str32[i])) {
          is_numeric = false;
          break;  // one non-digit settles the answer
        }
      }
      if (is_numeric)
        word_cost = 0;
    }
    delete [] clean_str32;
    cost += word_cost;
  }  // word_idx

  // return the mean cost
  return static_cast<int>(cost / static_cast<double>(words.size()));
}
Beispiel #11
0
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val =
        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

  textord_.TextordPage(pageseg_mode, width, height, pix_binary_,
                       blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
Beispiel #12
0
/*************************************************************************
 * write_results()
 *
 * All recognition and rejection has now been done. Generate the following:
 *   .txt file     - giving the final best choices with NO highlighting
 *   .raw file     - giving the tesseract top choice output for each word
 *   .map file     - showing how the .txt file has been rejected in the .ep file
 *   epchoice list - a list of one element per word, containing the text for the
 *                   epaper. Reject strings are inserted.
 *   inset list    - a list of bounding boxes of reject insets - indexed by the
 *                   reject strings in the epchoice text.
 *
 * NOTE(review): the list above is the historical description. In the body
 * visible here no file is written: txt_chs and map_chs are filled but never
 * read, wordstr/repetition_code/wordstr_lengths are assigned but never
 * consumed, and the only stored outputs are word->ep_choice (tilde-crunch
 * path), edits to word->best_choice / reject_map / box_word, and the flags
 * in stats_.
 *
 * page_res_it  - iterator whose current word is processed.
 * newline_type - character appended to ep_choice as the line terminator in
 *                the tilde-crunch path; in the normal path a non-zero value
 *                sets stats_.last_char_was_newline.
 * force_eol    - forces the newline (and possibly a reject char) in the
 *                tilde-crunch path and is copied to
 *                stats_.write_results_empty_block in the normal path.
 *************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
                              char newline_type,  // type of newline
                              BOOL8 force_eol) {  // override tilde crunch?
  WERD_RES *word = page_res_it.word();
  const UNICHARSET &uchset = *word->uch_set;
  STRING repetition_code;
  const STRING *wordstr;
  STRING wordstr_lengths;
  int i;
  char unrecognised = STRING (unrecognised_char)[0];
  // The three buffers below receive at most 9 chars each in this function
  // (space + up to 6 reject chars + newline + terminator).
  char ep_chars[32];             //Only for unlv_tilde_crunch
  int ep_chars_index = 0;
  char txt_chs[32];              //Only for unlv_tilde_crunch
  char map_chs[32];              //Only for unlv_tilde_crunch
  int txt_index = 0;
  BOOL8 need_reject = FALSE;
  UNICHAR_ID space = uchset.unichar_to_id(" ");
  /* TILDE CRUNCH PROCESSING: the word is crunched or has an empty best
     choice, and neither zero-kelvin rejection nor word-for-word mode is on */
  if ((word->unlv_crunch_mode != CR_NONE ||
       word->best_choice->length() == 0) &&
      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
    // A reject is needed unless the word is to be deleted, and only if no
    // tilde has been written yet for this run or the word keeps a definite
    // (non-fuzzy) leading space.
    if ((word->unlv_crunch_mode != CR_DELETE) &&
        (!stats_.tilde_crunch_written ||
         ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)))) {
      if (!word->word->flag (W_BOL) &&
          (word->word->space () > 0) &&
          !word->word->flag (W_FUZZY_NON) &&
          !word->word->flag (W_FUZZY_SP)) {
        // Write a space to separate from preceding good text.
        txt_chs[txt_index] = ' ';
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = ' ';
        stats_.last_char_was_tilde = false;
      }
      need_reject = TRUE;
    }
    if ((need_reject && !stats_.last_char_was_tilde) ||
        (force_eol && stats_.write_results_empty_block)) {
      /* Write a reject char - mark as rejected unless zero_rejection mode */
      stats_.last_char_was_tilde = TRUE;
      txt_chs[txt_index] = unrecognised;
      if (tessedit_zero_rejection || (suspect_level == 0)) {
        map_chs[txt_index++] = '1';
        ep_chars[ep_chars_index++] = unrecognised;
      }
      else {
        map_chs[txt_index++] = '0';
        /*
           The ep_choice string is a faked reject to allow newdiff to sync the
           .etx with the .txt and .map files.
         */
        ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //type
        ep_chars[ep_chars_index++] = 2;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
                                 //dummy reject
        ep_chars[ep_chars_index++] = 1;
      }
      stats_.tilde_crunch_written = true;
      stats_.last_char_was_newline = false;
      stats_.write_results_empty_block = false;
    }

    if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
      /* Add a new line output */
      txt_chs[txt_index] = '\n';
      map_chs[txt_index++] = '\n';
                                 //end line
      ep_chars[ep_chars_index++] = newline_type;

                                 //Cos of the real newline
      stats_.tilde_crunch_written = false;
      stats_.last_char_was_newline = true;
      stats_.last_char_was_tilde = false;
    }
    txt_chs[txt_index] = '\0';
    map_chs[txt_index] = '\0';
    ep_chars[ep_chars_index] = '\0';  // terminate string
    // Only ep_chars leaves this function; it becomes the word's ep_choice.
    word->ep_choice = new WERD_CHOICE(ep_chars, uchset);

    if (force_eol)
      stats_.write_results_empty_block = true;
    return;
  }

  /* NORMAL PROCESSING of non tilde crunched words */

  stats_.tilde_crunch_written = false;
  if (newline_type)
    stats_.last_char_was_newline = true;
  else
    stats_.last_char_was_newline = false;
  stats_.write_results_empty_block = force_eol;  // about to write a real word

  // NOTE(review): unichar_id(0) is read here without a visible length check;
  // best_choice can be empty on this path when zero-kelvin rejection or
  // word-for-word mode is on. Confirm callers rule that out.
  if (unlv_tilde_crunching &&
      stats_.last_char_was_tilde &&
      (word->word->space() == 0) &&
      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
      (word->best_choice->unichar_id(0) == space)) {
    /* Prevent adjacent tilde across words - we know that adjacent tildes within
       words have been removed */
    // Drop position 0 from every per-character structure so they stay in
    // step: best choice, blob choices, reject map and box word.
    word->best_choice->remove_unichar_id(0);
    if (word->best_choice->blob_choices() != NULL) {
      BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
      if (!blob_choices_it.empty()) delete blob_choices_it.extract();
    }
    word->reject_map.remove_pos (0);
    word->box_word->DeleteBox(0);
  }
  // Work out whether the output for this word ends in a tilde (space id).
  if (newline_type ||
    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
    stats_.last_char_was_tilde = false;
  else {
    if (word->reject_map.length () > 0) {
      if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
        stats_.last_char_was_tilde = true;
      else
        stats_.last_char_was_tilde = false;
    }
    else if (word->word->space () > 0)
      stats_.last_char_was_tilde = false;
    /* else it is unchanged as there are no output chars */
  }

  // The best choice and the reject map must describe the same characters.
  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());

  set_unlv_suspects(word);
  check_debug_pt (word, 120);
  if (tessedit_rejection_debug) {
    tprintf ("Dict word: \"%s\": %d\n",
             word->best_choice->debug_string().string(),
             dict_word(*(word->best_choice)));
  }
  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    // Build the repetition code: the 4-char prefix "|^~R" followed by the
    // repeated character, with a parallel string of per-unit byte lengths.
    repetition_code = "|^~R";
    wordstr_lengths = "\001\001\001\001";
    repetition_code += uchset.id_to_unichar(get_rep_char(word));
    wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
    wordstr = &repetition_code;
  } else {
    if (tessedit_zero_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      // Every rejected position gets setrej_minimal_rej_accept() applied
      // (presumably turning the reject into an accept - confirm in REJ).
      for (i = 0; i < word->best_choice->length(); ++i) {
        if (word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
    if (tessedit_minimal_rejection) {
      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
      // Same as above, except that positions holding the space id are left
      // rejected.
      for (i = 0; i < word->best_choice->length(); ++i) {
        if ((word->best_choice->unichar_id(i) != space) &&
            word->reject_map[i].rejected())
          word->reject_map[i].setrej_minimal_rej_accept();
      }
    }
  }
}
Beispiel #13
0
// Segment the page according to the current value of tessedit_pageseg_mode.
// If the pix_binary_ member is not NULL, it is used as the source image,
// and copied to image, otherwise it just uses image as the input.
// On return the blocks list owns all the constructed page layout.
int Tesseract::SegmentPage(const STRING* input_file,
                           IMAGE* image, BLOCK_LIST* blocks) {
  int width = image->get_xsize();
  int height = image->get_ysize();
  int resolution = image->get_res();
#ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL) {
    width = pixGetWidth(pix_binary_);
    height = pixGetHeight(pix_binary_);
    resolution = pixGetXRes(pix_binary_);
  }
#endif
  // Zero resolution messes up the algorithms, so make sure it is credible.
  if (resolution < kMinCredibleResolution)
    resolution = kDefaultResolution;
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (pageseg_mode != tesseract::PSM_AUTO &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  bool single_column = pageseg_mode > PSM_AUTO;
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_COLUMN.
    pageseg_mode = PSM_SINGLE_COLUMN;
  }

  TO_BLOCK_LIST land_blocks, port_blocks;
  TBOX page_box;
  if (pageseg_mode <= PSM_SINGLE_COLUMN) {
    if (AutoPageSeg(width, height, resolution, single_column,
                    image, blocks, &port_blocks) < 0) {
      return -1;
    }
    // To create blobs from the image region bounds uncomment this line:
    //  port_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
#if HAVE_LIBLEPT
    image->FromPix(pix_binary_);
#endif
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
  }
  if (blocks->empty()) {
    tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }

  if (port_blocks.empty()) {
    // AutoPageSeg was not used, so we need to find_components first.
    find_components(blocks, &land_blocks, &port_blocks, &page_box);
  } else {
    // AutoPageSeg does not need to find_components as it did that already.
    page_box.set_left(0);
    page_box.set_bottom(0);
    page_box.set_right(width);
    page_box.set_top(height);
    // Filter_blobs sets up the TO_BLOCKs the same as find_components does.
    filter_blobs(page_box.topright(), &port_blocks, true);
  }

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!port_blocks.empty());
  TO_BLOCK* to_block = to_block_it.data();
  if (pageseg_mode <= PSM_SINGLE_BLOCK ||
      to_block->line_size < 2) {
    // For now, AUTO, SINGLE_COLUMN and SINGLE_BLOCK all map to the old
    // textord. The difference is the number of blocks and how the are made.
    textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks,
                 this);
  } else {
    // SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    float gradient = make_single_row(page_box.topright(),
                                     to_block, &port_blocks, this);
    if (pageseg_mode == PSM_SINGLE_LINE) {
      // SINGLE_LINE uses the old word maker on the single line.
      make_words(page_box.topright(), gradient, blocks,
                 &land_blocks, &port_blocks, this);
    } else {
      // SINGLE_WORD and SINGLE_CHAR cram all the blobs into a
      // single word, and in SINGLE_CHAR mode, all the outlines
      // go in a single blob.
      make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                       to_block->get_rows(), to_block->block->row_list());
    }
  }
  return 0;
}
Beispiel #14
0
// Auto page segmentation. Divide the page image into blocks of uniform
// text linespacing and images.
// Width, height and resolution are derived from the input image.
// If the pix is non-NULL, then it is assumed to be the input, and it is
// copied to the image, otherwise the image is used directly.
// The output goes in the blocks list with corresponding TO_BLOCKs in the
// to_blocks list.
// If single_column is true, then no attempt is made to divide the image
// into columns, but multiple blocks are still made if the text is of
// non-uniform linespacing.
int Tesseract::AutoPageSeg(int width, int height, int resolution,
                           bool single_column, IMAGE* image,
                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
  int vertical_x = 0;
  int vertical_y = 1;
  TabVector_LIST v_lines;
  TabVector_LIST h_lines;
  ICOORD bleft(0, 0);
  Boxa* boxa = NULL;
  Pixa* pixa = NULL;
  // The blocks made by the ColumnFinder. Moved to blocks before return.
  BLOCK_LIST found_blocks;

#ifdef HAVE_LIBLEPT
  if (pix_binary_ != NULL) {
    if (textord_debug_images) {
      Pix* grey_pix = pixCreate(width, height, 8);
      // Printable images are light grey on white, but for screen display
      // they are black on dark grey so the other colors show up well.
      if (textord_debug_printable) {
        pixSetAll(grey_pix);
        pixSetMasked(grey_pix, pix_binary_, 192);
      } else {
        pixSetAllArbitrary(grey_pix, 64);
        pixSetMasked(grey_pix, pix_binary_, 0);
      }
      AlignedBlob::IncrementDebugPix();
      pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG);
      pixDestroy(&grey_pix);
    }
    if (tessedit_dump_pageseg_images)
      pixWrite("tessinput.png", pix_binary_, IFF_PNG);
    // Leptonica is used to find the lines and image regions in the input.
    LineFinder::FindVerticalLines(resolution, pix_binary_,
                                  &vertical_x, &vertical_y, &v_lines);
    LineFinder::FindHorizontalLines(resolution, pix_binary_, &h_lines);
    if (tessedit_dump_pageseg_images)
      pixWrite("tessnolines.png", pix_binary_, IFF_PNG);
    ImageFinder::FindImages(pix_binary_, &boxa, &pixa);
    if (tessedit_dump_pageseg_images)
      pixWrite("tessnoimages.png", pix_binary_, IFF_PNG);
    // Copy the Pix to the IMAGE.
    image->FromPix(pix_binary_);
    if (single_column)
      v_lines.clear();
  }
#endif
  TO_BLOCK_LIST land_blocks, port_blocks;
  TBOX page_box;
  // The rest of the algorithm uses the usual connected components.
  find_components(blocks, &land_blocks, &port_blocks, &page_box);

  TO_BLOCK_IT to_block_it(&port_blocks);
  ASSERT_HOST(!to_block_it.empty());
  for (to_block_it.mark_cycle_pt(); !to_block_it.cycled_list();
       to_block_it.forward()) {
    TO_BLOCK* to_block = to_block_it.data();
    TBOX blkbox = to_block->block->bounding_box();
    if (to_block->line_size >= 2) {
      // Note: if there are multiple blocks, then v_lines, boxa, and pixa
      // are empty on the next iteration, but in this case, we assume
      // that there aren't any interesting line separators or images, since
      // it means that we have a pre-defined unlv zone file.
      ColumnFinder finder(static_cast<int>(to_block->line_size),
                          blkbox.botleft(), blkbox.topright(),
                          &v_lines, &h_lines, vertical_x, vertical_y);
      if (finder.FindBlocks(height, resolution, single_column,
                            to_block, boxa, pixa, &found_blocks, to_blocks) < 0)
        return -1;
      finder.ComputeDeskewVectors(&deskew_, &reskew_);
      boxa = NULL;
      pixa = NULL;
    }
  }
#ifdef HAVE_LIBLEPT
  boxaDestroy(&boxa);
  pixaDestroy(&pixa);
#endif
  blocks->clear();
  BLOCK_IT block_it(blocks);
  // Move the found blocks to the input/output blocks.
  block_it.add_list_after(&found_blocks);

  if (textord_debug_images) {
    // The debug image is no longer needed so delete it.
    unlink(AlignedBlob::textord_debug_pix().string());
  }
  return 0;
}
Beispiel #15
0
// Builds the textlines and the words within every block.
// When to_blocks arrives empty the connected components are found here;
// otherwise, except in sparse modes, the existing blobs are only filtered.
void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew,
                          int width, int height, Pix *binary_pix,
                          Pix *thresholds_pix, Pix *grey_pix,
                          bool use_box_bottoms,
                          BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
  page_tr_.set_x(width);
  page_tr_.set_y(height);
  if (to_blocks->empty()) {
    // AutoPageSeg did not run, so find_components has to be done here.
    find_components(binary_pix, blocks, to_blocks);
    TO_BLOCK_IT edge_it(to_blocks);
    for (edge_it.mark_cycle_pt(); !edge_it.cycled_list(); edge_it.forward()) {
      // Edge offsets are computed with or without a grey_pix. Auto page seg
      // was by-passed, so this step has to happen here, and in this page
      // segmentation mode there is no non-text to keep it away from.
      edge_it.data()->ComputeEdgeOffsets(thresholds_pix, grey_pix);
    }
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // AutoPageSeg already did find_components; filter_blobs sets up the
    // TO_BLOCKs the same way find_components does.
    filter_blobs(page_tr_, to_blocks, true);
  }

  ASSERT_HOST(!to_blocks->empty());
  if (pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT) {
    const FCOORD anticlockwise90(0.0f, 1.0f);
    const FCOORD clockwise90(0.0f, -1.0f);
    TO_BLOCK_IT rotate_it(to_blocks);
    for (rotate_it.mark_cycle_pt(); !rotate_it.cycled_list();
         rotate_it.forward()) {
      TO_BLOCK *vert_to_block = rotate_it.data();
      BLOCK *vert_block = vert_to_block->block;
      // Give the block a fake poly_block built from its bounding box.
      POLY_BLOCK *fake_poly = new POLY_BLOCK(vert_block->bounding_box(),
                                             PT_VERTICAL_TEXT);
      vert_block->set_poly_block(fake_poly);
      // Rotate the to_block together with its block and blobnbox lists.
      vert_to_block->rotate(anticlockwise90);
      // The block's rotation values follow the convention that layout
      // analysis uses for vertical text.
      vert_block->set_re_rotation(clockwise90);
      vert_block->set_classify_rotation(clockwise90);
    }
  }

  TO_BLOCK_IT to_block_it(to_blocks);
  TO_BLOCK *first_to_block = to_block_it.data();
  // Make the rows in the block, the old fashioned way.
  float gradient = 0;
  if (PSM_LINE_FIND_ENABLED(pageseg_mode)) {
    gradient = make_rows(page_tr_, to_blocks);
  } else if (!PSM_SPARSE(pageseg_mode)) {
    // RAW_LINE, SINGLE_LINE, SINGLE_WORD and SINGLE_CHAR all need a single row.
    const bool allow_sub_blobs = pageseg_mode != PSM_RAW_LINE;
    gradient = make_single_row(page_tr_, allow_sub_blobs,
                               first_to_block, to_blocks);
  }
  BaselineDetect baseline_detector(textord_baseline_debug,
                                   reskew, to_blocks);
  baseline_detector.ComputeStraightBaselines(use_box_bottoms);
  baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true,
      textord_heavy_nr,
      textord_show_final_rows,
      this);
  // Next, the words within the lines.
  if (!PSM_WORD_FIND_ENABLED(pageseg_mode)) {
    // SINGLE_WORD and SINGLE_CHAR put all the blobs into one word, and
    // SINGLE_CHAR additionally puts all the outlines into one blob.
    TO_BLOCK *word_to_block = to_block_it.data();
    make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                     word_to_block->get_rows(),
                     word_to_block->block->row_list());
  } else {
    // SINGLE_LINE runs the old word maker on the single line.
    make_words(this, page_tr_, gradient, blocks, to_blocks);
  }
  // Remove empties.
  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);

  // Row margins are computed per block now so that paragraph detection can
  // use them later.
  BLOCK_IT margin_it(blocks);
  for (margin_it.mark_cycle_pt(); !margin_it.cycled_list();
       margin_it.forward()) {
    margin_it.data()->compute_row_margins();
  }
#ifndef GRAPHICS_DISABLED
  close_to_win();
#endif
}
// Constructs this outline as a rotated copy of srcline.
//
// The source outline is walked step by step from its start; each visited
// position is rotated by the given rotation, and steps are emitted into this
// outline until the running position (prevpos) reaches the rotated position
// (destpos). Step pairs whose directions differ by +/-64 are treated as
// u-turns and dropped. The step buffer is sized for twice the source's step
// count and stepcount is trimmed to the number actually used at the end.
//
// The whole walk is attempted up to two times: the two passes swap which of
// the two steps of a split move gets an extra 32 added to its direction, and
// the second pass only runs if the first produced fewer than 4 steps.
//
// Asserts (ASSERT_HOST) that the rotated path closes on its start point.
C_OUTLINE::C_OUTLINE(                     //constructor
                     C_OUTLINE *srcline,  //outline to
                     FCOORD rotation      //rotate
                    ) {
  TBOX new_box;                   //easy bounding
  inT16 stepindex;               //index to step
  inT16 dirdiff;                 //direction change
  ICOORD pos;                    //current position
  ICOORD prevpos;                //previous dest point

  ICOORD destpos;                //destination point
  inT16 destindex;               //index to step
  DIR128 dir;                    //coded direction
  uinT8 new_step;

  // Reserve room for two output steps per source step.
  stepcount = srcline->stepcount * 2;
                                 //get memory
  steps = (uinT8 *) alloc_mem (step_mem());
  memset(steps, 0, step_mem());

  for (int iteration = 0; iteration < 2; ++iteration) {
    // Direction offsets for the first and second step of a split move;
    // the two passes swap which of them receives the 32.
    DIR128 round1 = iteration == 0 ? 32 : 0;
    DIR128 round2 = iteration != 0 ? 32 : 0;
    // Restart from the rotated start point with an empty output.
    pos = srcline->start;
    prevpos = pos;
    prevpos.rotate (rotation);
    start = prevpos;
    box = TBOX (start, start);
    destindex = 0;
    for (stepindex = 0; stepindex < srcline->stepcount; stepindex++) {
      pos += srcline->step (stepindex);
      destpos = pos;
      destpos.rotate (rotation);
      //  printf("%i %i %i %i ", destpos.x(), destpos.y(), pos.x(), pos.y());
      // Emit steps until the output position catches up with destpos.
      while (destpos.x () != prevpos.x () || destpos.y () != prevpos.y ()) {
        dir = DIR128 (FCOORD (destpos - prevpos));
        dir += 64;                 //turn to step style
        new_step = dir.get_dir ();
        //  printf(" %i\n", new_step);
        if (new_step & 31) {
          // Direction is not a multiple of 32: emit it as two steps, one
          // with each rounding offset.
          set_step(destindex++, dir + round1);
          prevpos += step(destindex - 1);
          if (destindex < 2
            || ((dirdiff =
            step_dir (destindex - 1) - step_dir (destindex - 2)) !=
            -64 && dirdiff != 64)) {
            set_step(destindex++, dir + round2);
            prevpos += step(destindex - 1);
          } else {
            // The first step doubled back on the previous one: undo both
            // and overwrite the previous step with the round2 variant.
            prevpos -= step(destindex - 1);
            destindex--;
            prevpos -= step(destindex - 1);
            set_step(destindex - 1, dir + round2);
            prevpos += step(destindex - 1);
          }
        }
        else {
          // Direction is a multiple of 32: a single step is enough.
          set_step(destindex++, dir);
          prevpos += step(destindex - 1);
        }
        // Cancel any trailing pair of steps that reverse each other.
        while (destindex >= 2 &&
               ((dirdiff =
                 step_dir (destindex - 1) - step_dir (destindex - 2)) == -64 ||
                dirdiff == 64)) {
          prevpos -= step(destindex - 1);
          prevpos -= step(destindex - 2);
          destindex -= 2;        // Forget u turn
        }
        //ASSERT_HOST(prevpos.x() == destpos.x() && prevpos.y() == destpos.y());
        // Grow the bounding box to include the rotated point.
        new_box = TBOX (destpos, destpos);
        box += new_box;
      }
    }
    // The rotated walk must end where it began.
    ASSERT_HOST (destpos.x () == start.x () && destpos.y () == start.y ());
    // Remove u-turns across the wrap-around between the last and first step
    // by advancing the start point and shifting the remaining steps down.
    dirdiff = step_dir (destindex - 1) - step_dir (0);
    while ((dirdiff == 64 || dirdiff == -64) && destindex > 1) {
      start += step (0);
      destindex -= 2;
      for (int i = 0; i < destindex; ++i)
        set_step(i, step_dir(i + 1));
      dirdiff = step_dir (destindex - 1) - step_dir (0);
    }
    // At least 4 steps were produced, so skip the second pass.
    if (destindex >= 4)
      break;
  }
  // Trim to the steps actually used, then re-walk them to confirm closure.
  stepcount = destindex;
  destpos = start;
  for (stepindex = 0; stepindex < stepcount; stepindex++) {
    destpos += step (stepindex);
  }
  ASSERT_HOST (destpos.x () == start.x () && destpos.y () == start.y ());
}
Beispiel #17
0
/**
 * Segment the page according to the current value of tessedit_pageseg_mode.
 * pix_binary_ is used as the source image and should not be NULL.
 * On return the blocks list owns all the constructed page layout.
 */
int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                           Tesseract* osd_tess, OSResults* osr) {
  ASSERT_HOST(pix_binary_ != NULL);
  int width = pixGetWidth(pix_binary_);
  int height = pixGetHeight(pix_binary_);
  // Get page segmentation mode.
  PageSegMode pageseg_mode = static_cast<PageSegMode>(
      static_cast<int>(tessedit_pageseg_mode));
  // If a UNLV zone file can be found, use that instead of segmentation.
  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
      input_file != NULL && input_file->length() > 0) {
    STRING name = *input_file;
    const char* lastdot = strrchr(name.string(), '.');
    if (lastdot != NULL)
      name[lastdot - name.string()] = '\0';
    read_unlv_file(name, width, height, blocks);
  }
  if (blocks->empty()) {
    // No UNLV file present. Work according to the PageSegMode.
    // First make a single block covering the whole image.
    BLOCK_IT block_it(blocks);
    BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
    block->set_right_to_left(right_to_left());
    block_it.add_to_end(block);
  } else {
    // UNLV file present. Use PSM_SINGLE_BLOCK.
    pageseg_mode = PSM_SINGLE_BLOCK;
  }
  // The diacritic_blobs holds noise blobs that may be diacritics. They
  // are separated out on areas of the image that seem noisy and short-circuit
  // the layout process, going straight from the initial partition creation
  // right through to after word segmentation, where they are added to the
  // rej_cblobs list of the most appropriate word. From there classification
  // will determine whether they are used.
  BLOBNBOX_LIST diacritic_blobs;
  int auto_page_seg_ret_val = 0;
  TO_BLOCK_LIST to_blocks;
  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
      PSM_SPARSE(pageseg_mode)) {
    auto_page_seg_ret_val = AutoPageSeg(
        pageseg_mode, blocks, &to_blocks,
        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
    if (pageseg_mode == PSM_OSD_ONLY)
      return auto_page_seg_ret_val;
    // To create blobs from the image region bounds uncomment this line:
    //  to_blocks.clear();  // Uncomment to go back to the old mode.
  } else {
    deskew_ = FCOORD(1.0f, 0.0f);
    reskew_ = FCOORD(1.0f, 0.0f);
    if (pageseg_mode == PSM_CIRCLE_WORD) {
      Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
      if (pixcleaned != NULL) {
        pixDestroy(&pix_binary_);
        pix_binary_ = pixcleaned;
      }
    }
  }

  if (auto_page_seg_ret_val < 0) {
    return -1;
  }

  if (blocks->empty()) {
    if (textord_debug_tabfind)
      tprintf("Empty page\n");
    return 0;  // AutoPageSeg found an empty page.
  }
  bool splitting =
      pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
  bool cjk_mode = textord_use_cjk_fp_model;

  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                       pix_thresholds_, pix_grey_, splitting || cjk_mode,
                       &diacritic_blobs, blocks, &to_blocks);
  return auto_page_seg_ret_val;
}
Beispiel #18
0
/**
 *  word_display()  Word Processor
 *
 *  Display the current word of pr_it in image_win according to its display
 *  modes.
 *
 *  When color_mode is not CM_RAINBOW and the word has a box_word, every blob
 *  box is drawn green, or red if the blob/word has the attribute selected by
 *  color_mode; nothing else is drawn in that case. FALSE is returned if the
 *  word has no font information, TRUE otherwise.
 *
 *  Otherwise the word's display flags select what is drawn: bounding boxes,
 *  edge steps, the polygonal approximation, and the word text and/or blamer
 *  information. If none of those drew anything, the word bounding box is
 *  drawn anyway. Returns TRUE.
 */
BOOL8 Tesseract::word_display(PAGE_RES_IT* pr_it) {
  WERD_RES* word_res = pr_it->word();
  WERD* word = word_res->word;
  TBOX word_bb;                   // word bounding box
  int word_height;               // ht of word BB
  BOOL8 displayed_something = FALSE;
  float shift;                   // from bot left
  C_BLOB_IT c_it;                // cblob iterator

  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
    BoxWord* box_word = word_res->box_word;
    WERD_CHOICE* best_choice = word_res->best_choice;
    int length = box_word->length();
    // Font information is required before any attribute can be shown.
    if (word_res->fontinfo == NULL) return FALSE;
    const FontInfo& font_info = *word_res->fontinfo;
    for (int i = 0; i < length; ++i) {
      // Green by default; red marks a blob that has the selected attribute.
      ScrollView::Color color = ScrollView::GREEN;
      switch (color_mode) {
        case CM_SUBSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_SUPERSCRIPT:
          if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
            color = ScrollView::RED;
          break;
        case CM_ITALIC:
          if (font_info.is_italic())
            color = ScrollView::RED;
          break;
        case CM_BOLD:
          if (font_info.is_bold())
            color = ScrollView::RED;
          break;
        case CM_FIXEDPITCH:
          if (font_info.is_fixed_pitch())
            color = ScrollView::RED;
          break;
        case CM_SERIF:
          if (font_info.is_serif())
            color = ScrollView::RED;
          break;
        case CM_SMALLCAPS:
          if (word_res->small_caps)
            color = ScrollView::RED;
          break;
        case CM_DROPCAPS:
          if (best_choice->BlobPosition(i) == SP_DROPCAP)
            color = ScrollView::RED;
          break;
          // TODO(rays) underline is currently completely unsupported.
        case CM_UNDERLINE:
        default:
          break;
      }
      image_win->Pen(color);
      TBOX box = box_word->BlobBox(i);
      image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
    }
    return TRUE;
  }
  /*
    Note the double coercions of (ScrollView::Color)((inT32) ...) below
    are to keep the compiler happy.
  */
                                 // display bounding box
  if (word->display_flag(DF_BOX)) {
    word->bounding_box().plot(image_win,
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color),
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color));

    // The individual blob boxes are drawn in their own colour.
    ScrollView::Color c = (ScrollView::Color)
       ((inT32) editor_image_blob_bb_color);
    image_win->Pen(c);
    c_it.set_to_list(word->cblob_list());
    for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
      c_it.data()->bounding_box().plot(image_win);
    displayed_something = TRUE;
  }

                                 // display edge steps
  if (word->display_flag(DF_EDGE_STEP)) {     // edgesteps available
    word->plot(image_win);      // rainbow colors
    displayed_something = TRUE;
  }

                                 // display poly approx
  if (word->display_flag(DF_POLYGONAL)) {
                                 // need to convert
    TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
    tword->plot(image_win);
    delete tword;
    displayed_something = TRUE;
  }

  // Display correct text and blamer information.
  STRING text;
  STRING blame;
  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
    text = word->text();
  }
  // Blamer output replaces the plain text, unless the blamer says the
  // result was correct.
  if (word->display_flag(DF_BLAMER) &&
      !(word_res->blamer_bundle != NULL &&
        word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
    text = "";
    const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
    if (blamer_bundle == NULL) {
      text += "NULL";
    } else {
      text = blamer_bundle->TruthString();
    }
    text += " -> ";
    STRING best_choice_str;
    if (word_res->best_choice == NULL) {
      best_choice_str = "NULL";
    } else {
      word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
    }
    text += best_choice_str;
    // With no blamer bundle the reason shown is IRR_PAGE_LAYOUT.
    IncorrectResultReason reason = (blamer_bundle == NULL) ?
        IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
    ASSERT_HOST(reason < IRR_NUM_REASONS);
    blame += " [";
    blame += BlamerBundle::IncorrectReasonName(reason);
    blame += "]";
  }
  if (text.length() > 0) {
    word_bb = word->bounding_box();
    image_win->Pen(ScrollView::RED);
    word_height = word_bb.height();
    // Text is half the word height, capped at 20.
    int text_height = 0.50 * word_height;
    if (text_height > 20) text_height = 20;
    image_win->TextAttributes("Arial", text_height, false, false, false);
    // Only indent the text when the word is wider than it is tall.
    shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
    image_win->Text(word_bb.left() + shift,
                    word_bb.bottom() + 0.25 * word_height, text.string());
    if (blame.length() > 0) {
      // The blame reason goes one text line below the word text.
      image_win->Text(word_bb.left() + shift,
                      word_bb.bottom() + 0.25 * word_height - text_height,
                      blame.string());
    }

    displayed_something = TRUE;
  }

  if (!displayed_something)      // display BBox anyway
    word->bounding_box().plot(image_win,
     (ScrollView::Color)((inT32) editor_image_word_bb_color),
     (ScrollView::Color)((inT32)
      editor_image_word_bb_color));
  return TRUE;
}
/**
 * rebuild_current_state
 *
 * Rebuilds the list of blob choices for the segmentation described by
 * state, walking the chunks from last to first and classifying each one
 * with join_blobs_and_classify. Chunks that best_choice marks as character
 * fragments are merged back into a single blob, which is re-classified and
 * given an extra choice composed from the fragment ratings.
 *
 * If force_rebuild is false and best_choice contains no fragments, nothing
 * is rebuilt and old_choices is returned unchanged. Otherwise old_choices
 * is deleted (together with its data pointers) and a newly allocated
 * vector is returned.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
    if (wordrec_display_segmentations) {
      print_state("Rebuiling state", state, num_joints);
    }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  // Advance x past every chunk listed in search_state; search_state[0] is
  // used as the number of entries that follow it.
  int x = 0;
  int y;
  int i;
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  // y is the index of the last blob; [x, y] is now the final chunk.
  y = count_blobs (blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    // One (initially NULL) char_choices slot per character of best_choice,
    // and one expanded length entry per fragment of that character.
    for (int i = 0; i < best_choice.length(); ++i) {
      *char_choices += NULL;
      if (fragment_lengths[i] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[i]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[i];
      }
    }
  } else {
    // No fragment information: every chunk is a whole character.
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments to
  // merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_ptr = best_choice.unichar_lengths().string();
    word_lengths_ptr += (strlen(word_lengths_ptr)-1);
    // Make word_str point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr)-*word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
    expanded_fragment_lengths_str.string();
  bool merging_fragment = false;   // true while in the middle of a character
  int true_y = -1;                 // end blob of the character being merged
  char unichar[UNICHAR_LEN + 1];   // text of the character being merged
  int fragment_pieces = -1;        // fragments still to be consumed
  float rating = 0.0;              // sum of the fragment ratings
  float certainty = -MAX_FLOAT32;  // largest fragment certainty seen so far

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  //    NOTE: a choice composed from original fragment choices will be always
  //    added to the new choices list for each character composed from
  //    fragments (even if the choice for the corresponding character appears
  //    in the re-classified choices list for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  // Walk the chunks from the last one back to the first.
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
          getDict().getUnicharset().get_fragment(temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
          new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                          rating, certainty, 0, NO_PERM);

        // Insert merged_blob into current_choices, such that current_choices
        // are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        // Empty loop body: just advance to the first choice whose rating is
        // not below that of merged_choice.
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward());
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      // A whole character is complete: store it in the next free slot,
      // filling char_choices from the back.
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Update word_ptr and word_lengths_ptr.
      // NOTE(review): once the first character of the word has been stored,
      // this decrement appears to step word_lengths_ptr one element before
      // the start of the lengths string and then read through it - confirm.
      if (word_lengths_ptr != NULL && word_ptr != NULL) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    // Step back to the preceding chunk.
    y = x - 1;
    x = y - search_state[i];
  }
  // The old choices have been replaced, so release them.
  old_choices->delete_data_pointers();
  delete old_choices;
  memfree(search_state);

  return (char_choices);
}
// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, bool decompose,
                          UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation
    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (char32 u_ch : uni_vector) {
      if (u_isalpha(u_ch)) unichar_isalpha = true;
      if (u_islower(u_ch)) unichar_islower = true;
      if (u_isupper(u_ch)) unichar_isupper = true;
      if (u_isdigit(u_ch)) unichar_isdigit = true;
      if (u_ispunct(u_ch)) unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      std::vector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    std::vector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    std::string normed_str;
    if (unichar_id != 0 &&
        tesseract::NormalizeUTF8String(
            decompose ? tesseract::UnicodeNormMode::kNFKD
                      : tesseract::UnicodeNormMode::kNFKC,
            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
            unichar_str, &normed_str) &&
        !normed_str.empty()) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}
Beispiel #21
0
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
  if (!HasTruthString()) return false;
  ASSERT_HOST(it_->word()->uch_set != NULL);
  WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
  return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
}
Beispiel #22
0
// Splits the word at its widest inter-blob gap, recognizes each half with
// recog_word_recursive, and joins the two results back together.
// The combined choice is returned, raw_choice receives the combined raw
// choice and outword the joined output word. Both halves are passed the
// same blob_choices list.
WERD_CHOICE *split_and_recog_word(                           //recog one word
                                  WERD *word,                //word to do
                                  DENORM *denorm,            //de-normaliser
                                  POLY_MATCHER matcher,      //matcher function
                                  POLY_TESTER tester,        //tester function
                                  POLY_TESTER trainer,       //trainer function
                                  BOOL8 testing,             //true if answer driven
                                  WERD_CHOICE *&raw_choice,  //raw result //list of blob lists
                                  BLOB_CHOICE_LIST_CLIST *blob_choices,
                                  WERD *&outword             //bln word output
                                 ) {
  //   inT32                                                      outword1_len;
  //   inT32                                                      outword2_len;
  WERD *first_word;              //poly copy of word
  WERD *second_word;             //fabricated word
  WERD *outword2;                //2nd output word
  PBLOB *blob;
  WERD_CHOICE *result;           //return value
  WERD_CHOICE *result2;          //output of 2nd word
  WERD_CHOICE *raw_choice2;      //raw version of 2nd
  float gap;                     //blob gap
  float bestgap;                 //biggest gap
  PBLOB_LIST new_blobs;          //list of gathered blobs
  PBLOB_IT blob_it;
                                 //iterator
  PBLOB_IT new_blob_it = &new_blobs;

  first_word = word->poly_copy (denorm->row ()->x_height ());
  blob_it.set_to_list (first_word->blob_list ());
  // Start below any real gap so the first gap examined always becomes the
  // best so far.
  bestgap = -MAX_INT32;
  // NOTE(review): if the list has a single blob this loop presumably never
  // runs and new_blob_it still refers to new_blobs rather than to the word's
  // blob list - confirm callers never pass such a word.
  while (!blob_it.at_last ()) {
    blob = blob_it.data ();
                                 //gap to next
    gap = blob_it.data_relative (1)->bounding_box ().left () - blob->bounding_box ().right ();
    blob_it.forward ();
    if (gap > bestgap) {
      bestgap = gap;             //find biggest
      new_blob_it = blob_it;     //save position
    }
  }
                                 //take 2nd half
  new_blobs.assign_to_sublist (&new_blob_it, &blob_it);
                                 //make it a word
  second_word = new WERD (&new_blobs, 1, NULL);
  // No blob may be lost or duplicated by the split.
  ASSERT_HOST (word->blob_list ()->length () ==
    first_word->blob_list ()->length () +
    second_word->blob_list ()->length ());

  result = recog_word_recursive (first_word, denorm, matcher,
    tester, trainer, testing, raw_choice,
    blob_choices, outword);
  delete first_word;             //done that one
  result2 = recog_word_recursive (second_word, denorm, matcher,
    tester, trainer, testing, raw_choice2,
    blob_choices, outword2);
  delete second_word;            //done that too
  *result += *result2;           //combine ratings
  delete result2;
  *raw_choice += *raw_choice2;
  delete raw_choice2;            //finished with it
  //   outword1_len= outword->blob_list()->length();
  //   outword2_len= outword2->blob_list()->length();
  outword->join_on (outword2);   //join words
  delete outword2;
  //   if ( outword->blob_list()->length() != outword1_len + outword2_len )
  //      tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
  //                                outword1_len, outword2_len, outword->blob_list()->length() );
  //   ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
  return result;
}
// Classifies how the horizontal extent (left, right) at height y, with the
// given margins, relates to the columns of this set, and returns the
// ColumnSpanningType that best explains it.
// Outputs: *first_col and *last_col are the first and last column index
// touched, *first_spanned_col the leftmost column that is spanned (-1 if
// none).
// Column indices are 2n + 1 for real columns (0 based) and even values
// represent the gaps in between columns, with 0 being left of the leftmost.
// resolution refers to the ppi resolution of the image.
ColumnSpanningType ColPartitionSet::SpanningType(int resolution,
                                                 int left, int right, int y,
                                                 int left_margin,
                                                 int right_margin,
                                                 int* first_col,
                                                 int* last_col,
                                                 int* first_spanned_col) {
  *first_col = -1;
  *last_col = -1;
  *first_spanned_col = -1;
  int margin_columns = 0;
  int col_index = 1;
  ColPartition_IT col_it(&parts_);
  for (col_it.mark_cycle_pt(); !col_it.cycled_list();
       col_it.forward(), col_index += 2) {
    ColPartition* column = col_it.data();

    // Case 1: the left edge lies inside this column.
    if (column->ColumnContains(left, y)) {
      // first_col is set here even though margin_columns may stay zero. It
      // gets reset by the first column that is genuinely spanned, yet still
      // lets us tell this apart from a noise partition touching no column.
      *first_col = col_index;
      if (column->ColumnContains(right, y)) {
        // Both ends sit within this single column.
        *last_col = col_index;
        return CST_FLOWING;
      }
      if (left_margin <= column->LeftAtY(y)) {
        // The margin reaches the column's left edge: completely spanned.
        *first_spanned_col = col_index;
        margin_columns = 1;
      }
      continue;
    }

    // Case 2: the right edge lies inside this column, so the search ends.
    if (column->ColumnContains(right, y)) {
      if (*first_col < 0) {
        // The extent began in the gap before this column.
        *first_col = col_index - 1;
      }
      if (right_margin >= column->RightAtY(y)) {
        // The margin reaches the column's right edge: completely spanned.
        if (margin_columns == 0) {
          *first_spanned_col = col_index;
        }
        ++margin_columns;
      }
      *last_col = col_index;
      break;
    }

    // Case 3: neither end is inside, but the extent covers the column.
    if (left < column->LeftAtY(y) && right > column->RightAtY(y)) {
      if (*first_col < 0) {
        // The extent began between the previous column and this one.
        *first_col = col_index - 1;
      }
      if (margin_columns == 0) {
        *first_spanned_col = col_index;
      }
      *last_col = col_index;
      continue;
    }

    // Case 4: this column is already to the right of the extent.
    if (right < column->LeftAtY(y)) {
      *last_col = col_index - 1;
      if (*first_col < 0) {
        // Entirely between two columns => noise.
        *first_col = col_index - 1;
      }
      break;
    }
  }

  // Anything still unset falls in the last in-between gap.
  if (*first_col < 0) {
    *first_col = col_index - 1;
  }
  if (*last_col < 0) {
    *last_col = col_index - 1;
  }
  ASSERT_HOST(*first_col >= 0 && *last_col >= 0);
  ASSERT_HOST(*first_col <= *last_col);

  if (*first_col == *last_col && right - left < kMinColumnWidth * resolution) {
    // Neither end was in a column and nothing was spanned, so the extent
    // lies entirely between columns: noise.
    return CST_NOISE;
  }
  if (margin_columns > 1) {
    // The margins reached the edges of first and last columns => heading.
    return CST_HEADING;
  }
  if (margin_columns == 1 && parts_.singleton()) {
    // Exception: a heading that sticks outside of single-column text.
    return CST_HEADING;
  }
  // Left and right were not in the same column, but the extent does not
  // reach the edge of its start and end: a pullout.
  return CST_PULLOUT;
}
Beispiel #24
0
/**********************************************************************
 * recog_word
 *
 * Recognize one word. A word with no blobs gets empty word and raw choices
 * and a polygonal copy as its output word; any other word is handed to
 * recog_word_recursive. The result is checked to have one length entry per
 * output blob and per blob choice list, the reject blobs are copied into
 * the output word, and, if tessedit_override_permuter is set, a
 * non-dictionary permuter is replaced by the one dict_word reports.
 **********************************************************************/
WERD_CHOICE *recog_word(                           //recog one word
                        WERD *word,                //word to do
                        DENORM *denorm,            //de-normaliser
                        POLY_MATCHER matcher,      //matcher function
                        POLY_TESTER tester,        //tester function
                        POLY_TESTER trainer,       //trainer function
                        BOOL8 testing,             //true if answer driven
                        WERD_CHOICE *&raw_choice,  //raw result //list of blob lists
                        BLOB_CHOICE_LIST_CLIST *blob_choices,
                        WERD *&outword             //bln word output
                       ) {
  WERD_CHOICE *word_choice = NULL;

  if (!word->blob_list ()->empty ()) {
    word_choice = recog_word_recursive (word, denorm, matcher, tester,
                                        trainer, testing, raw_choice,
                                        blob_choices, outword);
  } else {
    // Nothing to recognize: fabricate empty results.
    char empty_lengths[] = {0};
    word_choice = new WERD_CHOICE ("", empty_lengths,
                                   10.0f, -1.0f, TOP_CHOICE_PERM);
    raw_choice = new WERD_CHOICE ("", empty_lengths,
                                  10.0f, -1.0f, TOP_CHOICE_PERM);
    outword = word->poly_copy (denorm->row ()->x_height ());
  }

  // Print the details before the assertions below abort on a mismatch.
  const int num_lengths = word_choice->lengths ().length ();
  const int num_out_blobs = outword->blob_list ()->length ();
  const int num_choice_lists = blob_choices->length ();
  const bool counts_agree =
      num_lengths == num_out_blobs && num_lengths == num_choice_lists;
  if (!counts_agree) {
    tprintf
      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
      word_choice->string ().string (), num_lengths,
      num_out_blobs, num_choice_lists);
  }
  ASSERT_HOST (word_choice->lengths ().length () ==
    outword->blob_list ()->length ());
  ASSERT_HOST (word_choice->lengths ().length () == blob_choices->length ());

  /* Copy any reject blobs into the outword */
  outword->rej_blob_list ()->deep_copy (word->rej_blob_list ());

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    const uinT8 perm_type = word_choice->permuter ();
    const bool has_dawg_perm = perm_type == SYSTEM_DAWG_PERM ||
                               perm_type == FREQ_DAWG_PERM ||
                               perm_type == USER_DAWG_PERM;
    if (!has_dawg_perm) {
      const uinT8 real_dict_perm_type =
          dict_word (word_choice->string ().string ());
      const bool dict_has_dawg_perm =
          real_dict_perm_type == SYSTEM_DAWG_PERM ||
          real_dict_perm_type == FREQ_DAWG_PERM ||
          real_dict_perm_type == USER_DAWG_PERM;
      // Only adopt the dictionary permuter for words containing letters.
      if (dict_has_dawg_perm &&
          alpha_count (word_choice->string ().string (),
                       word_choice->lengths ().string ()) > 0) {
        word_choice->set_permuter (real_dict_perm_type);
      }
    }
    if (tessedit_rejection_debug && perm_type != word_choice->permuter ()) {
      tprintf ("Permuter Type Flipped from %d to %d\n",
        perm_type, word_choice->permuter ());
    }
  }
  assert ((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}
Beispiel #25
0
void AssociateUtils::ComputeStats(int col, int row,
                                  const AssociateStats *parent_stats,
                                  int parent_path_length,
                                  bool fixed_pitch,
                                  float max_char_wh_ratio,
                                  WERD_RES *word_res,
                                  bool debug,
                                  AssociateStats *stats) {
  stats->Clear();

  ASSERT_HOST(word_res != NULL);
  if (word_res->blob_widths.empty()) {
    return;
  }
  if (debug) {
    tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n",
            col, row, fixed_pitch ? " (fixed pitch)" : "");
  }
  float normalizing_height = kBlnXHeight;
  ROW* blob_row = word_res->blob_row;
  // TODO(rays/daria) Can unicharset.script_has_xheight be useful here?
  if (fixed_pitch && blob_row != NULL) {
    // For fixed pitch language like CJK, we use the full text height
    // as the normalizing factor so we are not dependent on xheight
    // calculation.
    if (blob_row->body_size() > 0.0f) {
      normalizing_height = word_res->denorm.y_scale() * blob_row->body_size();
    } else {
      normalizing_height = word_res->denorm.y_scale() *
          (blob_row->x_height() + blob_row->ascenders());
    }
    if (debug) {
      tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n",
              normalizing_height, word_res->denorm.y_scale(),
              blob_row->x_height(), blob_row->ascenders());
    }
  }
  float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height;
  if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true;
  // Compute the gap sum for this shape. If there are only negative or only
  // positive gaps, record their sum in stats->gap_sum. However, if there is
  // a mixture, record only the sum of the positive gaps.
  // TODO(antonova): explain fragment.
  int negative_gap_sum = 0;
  for (int c = col; c < row; ++c) {
    int gap = word_res->GetBlobsGap(c);
    (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap;
  }
  if (stats->gap_sum == 0) stats->gap_sum = negative_gap_sum;
  if (debug) {
    tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n",
            wh_ratio, max_char_wh_ratio, stats->gap_sum,
            stats->bad_shape ? "bad_shape" : "");
  }
  // Compute shape_cost (for fixed pitch mode).
  if (fixed_pitch) {
    bool end_row = (row == (word_res->ratings->dimension() - 1));

    // Ensure that the blob has gaps on the left and the right sides
    // (except for beginning and ending punctuation) and that there is
    // no cutting through ink at the blob boundaries.
    if (col > 0) {
      float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height;
      SEAM *left_seam = word_res->seam_array[col - 1];
      if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) {
        stats->bad_shape = true;
      }
      if (debug) {
        tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority,
                stats->bad_shape ? "bad_shape" : "");
      }
    }
    float right_gap = 0.0f;
    if (!end_row) {
      right_gap = word_res->GetBlobsGap(row) / normalizing_height;
      SEAM *right_seam = word_res->seam_array[row];
      if (right_gap < kMinGap || right_seam->priority > 0.0f) {
        stats->bad_shape = true;
        if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true;
      }
      if (debug) {
        tprintf("right_gap %g right_seam %g %s\n",
                right_gap, right_seam->priority,
                stats->bad_shape ? "bad_shape" : "");
      }
    }

    // Impose additional segmentation penalties if blob widths or gaps
    // distribution don't fit a fixed-pitch model.
    // Since we only know the widths and gaps of the path explored so far,
    // the means and variances are computed for the path so far (not
    // considering characters to the right of the last character on the path).
    stats->full_wh_ratio = wh_ratio + right_gap;
    if (parent_stats != NULL) {
      stats->full_wh_ratio_total =
        (parent_stats->full_wh_ratio_total + stats->full_wh_ratio);
      float mean =
        stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1);
      stats->full_wh_ratio_var =
        parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2);
    } else {
      stats->full_wh_ratio_total = stats->full_wh_ratio;
    }
    if (debug) {
      tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n",
              stats->full_wh_ratio, stats->full_wh_ratio_total,
              stats->full_wh_ratio_var);
    }

    stats->shape_cost =
      FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio);

    // For some reason Tesseract prefers to treat the whole CJ words
    // as one blob when the initial segmentation is particularly bad.
    // This hack is to avoid favoring such states.
    if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) {
      stats->shape_cost += 10;
    }
    stats->shape_cost += stats->full_wh_ratio_var;
    if (debug) tprintf("shape_cost %g\n", stats->shape_cost);
  }
}
Beispiel #26
0
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
                                      TFile *ambig_file,
                                      int debug_level,
                                      bool use_ambigs_for_adaption,
                                      UNICHARSET *unicharset) {
  int i, j;
  UnicharIdVector *adaption_ambigs_entry;
  if (debug_level) tprintf("Reading ambiguities\n");

  int test_ambig_part_size;
  int replacement_ambig_part_size;
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char replacement_string[kMaxAmbigStringSize];
  UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(ambig_file->FGets(buffer, kBufferSize) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    ambig_file->Rewind();
  }
  while (ambig_file->FGets(buffer, kBufferSize) != NULL) {
    chomp_string(buffer);
    if (debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
                            buffer, &test_ambig_part_size, test_unichar_ids,
                            &replacement_ambig_part_size,
                            replacement_string, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
                                                 : dang_ambigs_,
                         test_ambig_part_size, test_unichar_ids,
                         replacement_ambig_part_size, replacement_string, type,
                         ambig_spec, unicharset))
      continue;

    // Update one_to_one_definite_ambigs_.
    if (test_ambig_part_size == 1 &&
        replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
        one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
    // Update ambigs_for_adaption_.
    if (use_ambigs_for_adaption) {
      GenericVector<UNICHAR_ID> encoding;
      // Silently ignore invalid strings, as before, so it is safe to use a
      // universal ambigs file.
      if (unicharset->encode_string(replacement_string, true, &encoding,
                                    NULL, NULL)) {
        for (i = 0; i < test_ambig_part_size; ++i) {
          if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
            ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
          }
          adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
          for (int r = 0; r < encoding.size(); ++r) {
            UNICHAR_ID id_to_insert = encoding[r];
            ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
            // Add the new unichar id to adaption_ambigs_entry (only if the
            // vector does not already contain it) keeping it in sorted order.
            for (j = 0; j < adaption_ambigs_entry->size() &&
                 (*adaption_ambigs_entry)[j] > id_to_insert; ++j);
            if (j < adaption_ambigs_entry->size()) {
              if ((*adaption_ambigs_entry)[j] != id_to_insert) {
                adaption_ambigs_entry->insert(id_to_insert, j);
              }
            } else {
              adaption_ambigs_entry->push_back(id_to_insert);
            }
          }
        }
      }
    }
  }
  delete[] buffer;

  // Fill in reverse_ambigs_for_adaption from ambigs_for_adaption vector.
  if (use_ambigs_for_adaption) {
    for (i = 0; i < ambigs_for_adaption_.size(); ++i) {
      adaption_ambigs_entry = ambigs_for_adaption_[i];
      if (adaption_ambigs_entry == NULL) continue;
      for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
        UNICHAR_ID ambig_id = (*adaption_ambigs_entry)[j];
        if (reverse_ambigs_for_adaption_[ambig_id] == NULL) {
          reverse_ambigs_for_adaption_[ambig_id] = new UnicharIdVector();
        }
        reverse_ambigs_for_adaption_[ambig_id]->push_back(i);
      }
    }
  }

  // Print what was read from the input file.
  if (debug_level > 1) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
        }
      }
    }
    if (use_ambigs_for_adaption) {
      for (int vec_id = 0; vec_id < 2; ++vec_id) {
        const GenericVector<UnicharIdVector *> &vec = (vec_id == 0) ?
          ambigs_for_adaption_ : reverse_ambigs_for_adaption_;
        for (i = 0; i < vec.size(); ++i) {
          adaption_ambigs_entry = vec[i];
          if (adaption_ambigs_entry != NULL) {
            tprintf("%sAmbigs for adaption for %s:\n",
                    (vec_id == 0) ? "" : "Reverse ",
                    unicharset->debug_str(i).string());
            for (j = 0; j < adaption_ambigs_entry->size(); ++j) {
              tprintf("%s ", unicharset->debug_str(
                  (*adaption_ambigs_entry)[j]).string());
            }
            tprintf("\n");
          }
        }
      }
    }
  }
}
// Extracts the needed information from the CHAR_DESC_STRUCT.
void TrainingSample::ExtractCharDesc(int int_feature_type,
                                     int micro_type,
                                     int cn_type,
                                     int geo_type,
                                     CHAR_DESC_STRUCT* char_desc) {
    // Extract the INT features.
    if (features_ != NULL) delete [] features_;
    FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type];
    if (char_features == NULL) {
        tprintf("Error: no features to train on of type %s\n",
                kIntFeatureType);
        num_features_ = 0;
        features_ = NULL;
    } else {
        num_features_ = char_features->NumFeatures;
        features_ = new INT_FEATURE_STRUCT[num_features_];
        for (int f = 0; f < num_features_; ++f) {
            features_[f].X =
                static_cast<uinT8>(char_features->Features[f]->Params[IntX]);
            features_[f].Y =
                static_cast<uinT8>(char_features->Features[f]->Params[IntY]);
            features_[f].Theta =
                static_cast<uinT8>(char_features->Features[f]->Params[IntDir]);
            features_[f].CP_misses = 0;
        }
    }
    // Extract the Micro features.
    if (micro_features_ != NULL) delete [] micro_features_;
    char_features = char_desc->FeatureSets[micro_type];
    if (char_features == NULL) {
        tprintf("Error: no features to train on of type %s\n",
                kMicroFeatureType);
        num_micro_features_ = 0;
        micro_features_ = NULL;
    } else {
        num_micro_features_ = char_features->NumFeatures;
        micro_features_ = new MicroFeature[num_micro_features_];
        for (int f = 0; f < num_micro_features_; ++f) {
            for (int d = 0; d < MFCount; ++d) {
                micro_features_[f][d] = char_features->Features[f]->Params[d];
            }
        }
    }
    // Extract the CN feature.
    char_features = char_desc->FeatureSets[cn_type];
    if (char_features == NULL) {
        tprintf("Error: no CN feature to train on.\n");
    } else {
        ASSERT_HOST(char_features->NumFeatures == 1);
        cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];
        cn_feature_[CharNormLength] =
            char_features->Features[0]->Params[CharNormLength];
        cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];
        cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy];
    }
    // Extract the Geo feature.
    char_features = char_desc->FeatureSets[geo_type];
    if (char_features == NULL) {
        tprintf("Error: no Geo feature to train on.\n");
    } else {
        ASSERT_HOST(char_features->NumFeatures == 1);
        geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];
        geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];
        geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];
    }
    features_are_indexed_ = false;
    features_are_mapped_ = false;
}
// Classifies the given [training] sample, writing to results.
// See shapeclassifier.h for a full description.
// Default implementation aborts: none of the arguments are used, and the
// assertion below can never hold.
int ShapeClassifier::ClassifySample(const TrainingSample& sample, Pix* page_pix,
                           int debug, int keep_this,
                           GenericVector<ShapeRating>* results) {
  // A string literal never compares equal to NULL, so this condition is
  // always false; the literal is there so the message shows up in the
  // asserted expression.
  ASSERT_HOST("Must implement ClassifySample!" == NULL);
  // Gives the function a return value on every path.
  return 0;
}
Beispiel #29
0
// Loads ambiguity definitions from AmbigFile, starting at the stream's
// current position, into replace_ambigs_, dang_ambigs_ and
// one_to_one_definite_ambigs_ (each first extended with one NULL slot per
// entry of unicharset).
//
// AmbigFile:  open stream positioned at the first line of the ambigs data.
// end_offset: reading stops once ftell(AmbigFile) reaches this offset; a
//             negative value means read until fgets fails.
// unicharset: passed to ParseAmbiguityLine and InsertIntoTable, and used to
//             print debug strings.
//
// An optional first line starting with 'v' gives the file version. If the
// first line is not a version line, the stream is moved back to the position
// it had on entry, so that the line is parsed as data.
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
                                      UNICHARSET *unicharset) {
  int i;
  for (i = 0; i < unicharset->size(); ++i) {
    replace_ambigs_.push_back(NULL);
    dang_ambigs_.push_back(NULL);
    one_to_one_definite_ambigs_.push_back(NULL);
  }
  if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");

  int TestAmbigPartSize;
  int ReplacementAmbigPartSize;
  // Maximum line size:
  //   10 for sizes of ambigs, tabs, ambig type and newline
  //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char ReplacementString[kMaxAmbigStringSize];
  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Remember where the ambigs data begins. The data is read from the current
  // position and bounded by end_offset, so it need not start at byte 0 of
  // the stream.
  const long start_offset = ftell(AmbigFile);

  // Determine the version of the ambigs file.
  int version = 0;
  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    // No version line: un-read the first line by returning to where the
    // data started. rewind() alone would jump to the start of the whole
    // stream, which is wrong when the data begins at a non-zero offset; it
    // is kept only as a fallback for when the start position is unknown.
    if (start_offset < 0 ||
        fseek(AmbigFile, start_offset, SEEK_SET) != 0) {
      rewind(AmbigFile);
    }
  }
  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
    chomp_string(buffer);
    if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    // Lines that ParseAmbiguityLine rejects are skipped.
    if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
                            &TestAmbigPartSize, TestUnicharIds,
                            &ReplacementAmbigPartSize,
                            ReplacementString, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    AmbigSpec *ambig_spec = new AmbigSpec();
    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
                    TestAmbigPartSize, TestUnicharIds,
                    ReplacementAmbigPartSize, ReplacementString, type,
                    ambig_spec, unicharset);

    // Update one_to_one_definite_ambigs_.
    if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
        one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
  }
  delete[] buffer;
  // Print what was read from the input file.
  if (global_ambigs_debug_level > 2) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
        (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
        }
      }
    }
  }
}
Beispiel #30
0
// Adds the dw_ in other to the dw_ in *this.
// Both dimensions of the two dw_ members must match; this is asserted
// before the addition.
void WeightMatrix::AddDeltas(const WeightMatrix& other) {
  ASSERT_HOST(dw_.dim1() == other.dw_.dim1());
  ASSERT_HOST(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}