示例#1
0
/**********************************************************************
 * get_piece_rating
 *
 * Check to see if this piece has already been classified.  If it has
 * return that rating.  Otherwise build the piece from the smaller
 * pieces, classify it, store the rating for later, and take the piece
 * apart again.
 **********************************************************************/
CHOICES get_piece_rating(MATRIX ratings,
                         TBLOB *blobs,
                         SEAMS seams,
                         INT16 start,
                         INT16 end,
                         INT32 fx,
                         STATE *this_state,
                         STATE *best_state,
                         INT32 pass,
                         INT32 blob_index) {
  CHOICES choices;

  choices = matrix_get (ratings, start, end);
  if (choices == NOT_CLASSIFIED) {
    choices =
      classify_piece(blobs,
                     seams,
                     start,
                     end,
                     fx,
                     this_state,
                     best_state,
                     pass,
                     blob_index);
    matrix_put(ratings, start, end, choices); 
  }
  return (choices);
}
示例#2
0
void Wordrec::ProcessSegSearchPainPoint(
    float pain_point_priority,
    const MATRIX_COORD &pain_point, const char* pain_point_type,
    GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
    LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
            pain_point_type, pain_point_priority,
            pain_point.col, pain_point.row);
  }
  ASSERT_HOST(pain_points != NULL);
  MATRIX *ratings = word_res->ratings;
  // Classify blob [pain_point.col pain_point.row]
  if (!pain_point.Valid(*ratings)) {
    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
  }
  ASSERT_HOST(pain_point.Valid(*ratings));
  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
                                                pain_point.col, pain_point.row,
                                                pain_point_type,
                                                word_res->chopped_word,
                                                blamer_bundle);
  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
  if (lst == NULL) {
    ratings->put(pain_point.col, pain_point.row, classified);
  } else {
    // We can not delete old BLOB_CHOICEs, since they might contain
    // ViterbiStateEntries that are parents of other "active" entries.
    // Thus if the matrix cell already contains classifications we add
    // the new ones to the beginning of the list.
    BLOB_CHOICE_IT it(lst);
    it.add_list_before(classified);
    delete classified;  // safe to delete, since empty after add_list_before()
    classified = NULL;
  }

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (classified != NULL && !classified->empty()) {
    if (pain_point.col > 0) {
      pain_points->GeneratePainPoint(
          pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
    if (pain_point.row + 1 < ratings->dimension()) {
      pain_points->GeneratePainPoint(
          pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
  }
  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
}
示例#3
0
/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results.  Find the
 * worst blob in the word and chop it up.  Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough.  The results are returned in the WERD_RES.
 */
void Wordrec::chop_word_main(WERD_RES *word) {
  int num_blobs = word->chopped_word->NumBlobs();
  if (word->ratings == NULL) {
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
  }
  if (word->ratings->get(0, 0) == NULL) {
    // Run initial classification.
    for (int b = 0; b < num_blobs; ++b) {
      BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
                                                 "Initial:", word->chopped_word,
                                                 word->blamer_bundle);
      word->ratings->put(b, b, choices);
    }
  } else {
    // Blobs have been pre-classified. Set matrix cell for all blob choices
    for (int col = 0; col < word->ratings->dimension(); ++col) {
      for (int row = col; row < word->ratings->dimension() &&
           row < col + word->ratings->bandwidth(); ++row) {
        BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
        if (choices != NULL) {
          BLOB_CHOICE_IT bc_it(choices);
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
            bc_it.data()->set_matrix_cell(col, row);
          }
        }
      }
    }
  }

  // Run Segmentation Search.
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);

  if (word->best_choice == NULL) {
    // SegSearch found no valid paths, so just use the leading diagonal.
    word->FakeWordFromRatings();
  }
  word->RebuildBestState();
  // If we finished without a hyphen at the end of the word, let the next word
  // be found in the dictionary.
  if (word->word->flag(W_EOL) &&
      !getDict().has_hyphen_end(*word->best_choice)) {
    getDict().reset_hyphen_vars(true);
  }

  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
    CallFillLattice(*word->ratings, word->best_choices,
                    *word->uch_set, word->blamer_bundle);
  }
  if (wordrec_debug_level > 0) {
    tprintf("Final Ratings Matrix:\n");
    word->ratings->print(getDict().getUnicharset());
  }
  word->FilterWordChoices(getDict().stopper_debug_level);
}
示例#4
0
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                 WERD_RES* word_res) {
  // Classify all required combinations of blobs and save results in choices.
  int word_length = word_res->box_word->length();
  GenericVector<BLOB_CHOICE_LIST*>* choices =
      new GenericVector<BLOB_CHOICE_LIST*>[word_length];
  for (int i = 0; i < word_length; ++i) {
    for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
      BLOB_CHOICE_LIST* match_result = classify_piece(
          word_res->seam_array, i, i + j - 1, "Applybox",
          word_res->chopped_word, word_res->blamer_bundle);
      if (applybox_debug > 2) {
        tprintf("%d+%d:", i, j);
        print_ratings_list("Segment:", match_result, unicharset);
      }
      choices[i].push_back(match_result);
    }
  }
  // Search the segmentation graph for the target text. Must be an exact
  // match. Using wildcards makes it difficult to find the correct
  // segmentation even when it is there.
  word_res->best_state.clear();
  GenericVector<int> search_segmentation;
  float best_rating = 0.0f;
  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                &search_segmentation, &best_rating, &word_res->best_state);
  for (int i = 0; i < word_length; ++i)
    choices[i].delete_data_pointers();
  delete [] choices;
  if (word_res->best_state.empty()) {
    // Build the original segmentation and if it is the same length as the
    // truth, assume it will do.
    int blob_count = 1;
    for (int s = 0; s < word_res->seam_array.size(); ++s) {
      SEAM* seam = word_res->seam_array[s];
      if (!seam->HasAnySplits()) {
        word_res->best_state.push_back(blob_count);
        blob_count = 1;
      } else {
        ++blob_count;
      }
    }
    word_res->best_state.push_back(blob_count);
    if (word_res->best_state.size() != target_text.size()) {
      word_res->best_state.clear();  // No good. Original segmentation bad size.
      return false;
    }
  }
  word_res->correct_text.clear();
  for (int i = 0; i < target_text.size(); ++i) {
    word_res->correct_text.push_back(
        STRING(unicharset.id_to_unichar(target_text[i])));
  }
  return true;
}
/**********************************************************************
 * get_piece_rating
 *
 * Check to see if this piece has already been classified.  If it has
 * return that rating.  Otherwise build the piece from the smaller
 * pieces, classify it, store the rating for later, and take the piece
 * apart again.
 **********************************************************************/
BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
                                            TBLOB *blobs,
                                            SEAMS seams,
                                            inT16 start,
                                            inT16 end) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs,
                             seams,
                             start,
                             end);
    ratings->put(start, end, choices);
  }
  return (choices);
}
示例#6
0
/**********************************************************************
 * get_piece_rating
 *
 * Check to see if this piece has already been classified.  If it has
 * return that rating.  Otherwise build the piece from the smaller
 * pieces, classify it, store the rating for later, and take the piece
 * apart again.
 **********************************************************************/
BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
                                            TBLOB *blobs,
                                            const DENORM& denorm,
                                            SEAMS seams,
                                            inT16 start,
                                            inT16 end,
                                            BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST *choices = ratings->get(start, end);
  if (choices == NOT_CLASSIFIED) {
    choices = classify_piece(blobs,
                             denorm,
                             seams,
                             start,
                             end,
                             blamer_bundle);
    ratings->put(start, end, choices);
    if (wordrec_debug_level > 1) {
      tprintf("get_piece_rating(): updated ratings matrix\n");
      ratings->print(getDict().getUnicharset());
    }
  }
  return (choices);
}
示例#7
0
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantess that all the parents would be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
        ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}