Ejemplo n.º 1
0
/**
 * @name improve_by_chopping
 *
 * Start with the current word of blobs and its classification.  Find
 * the worst blobs and try to divide them up to improve the ratings.
 * As long as ratings are produced by the new blob splitting.  When
 * all the splitting has been accomplished all the ratings memory is
 * reclaimed.
 */
void Wordrec::improve_by_chopping(WERD_RES *word,
                                  BLOB_CHOICE_LIST_VECTOR *char_choices,
                                  STATE *best_state,
                                  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                  DANGERR *fixpt,
                                  bool *best_choice_acceptable) {
  inT32 blob_number;
  float old_best;
  int fixpt_valid = 1;
  bool updated_best_choice = false;

  while (1) {  // improvement loop
    if (!fixpt_valid) fixpt->clear();
    old_best = word->best_choice->rating();
    if (improve_one_blob(word->chopped_word, char_choices,
                         &blob_number, &word->seam_array,
                         fixpt, (fragments_guide_chopper &&
                                 word->best_choice->fragment_mark()))) {
      getDict().LogNewSplit(blob_number);
      updated_best_choice =
        getDict().permute_characters(*char_choices, word->best_choice,
                                     word->raw_choice);

      if (old_best > word->best_choice->rating()) {
        set_n_ones(best_state, char_choices->length() - 1);
        fixpt_valid = 1;
      }
      else {
        insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
        fixpt_valid = 0;
      }

      if (chop_debug)
        print_state("best state = ",
          best_state, count_blobs(word->chopped_word->blobs) - 1);
    } else {
      break;
    }

    // Check if we should break from the loop.
    bool done = false;
    bool replaced = false;
    if ((updated_best_choice &&
         (*best_choice_acceptable =
          getDict().AcceptableChoice(char_choices, word->best_choice,
                                     fixpt, CHOPPER_CALLER, &replaced))) ||
        char_choices->length() >= MAX_NUM_CHUNKS) {
      done = true;
    }
    if (replaced) update_blob_classifications(word->chopped_word,
                                              *char_choices);
    if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
    if (done) break;
  }
  if (!fixpt_valid) fixpt->clear();
}
Ejemplo n.º 2
0
/**
 * @name evaluate_state
 *
 * Evaluate the segmentation that is represented by this state in the
 * best first search.  Add this state to the "states_seen" list.
 */
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt,
                              BlamerBundle *blamer_bundle) {
  BLOB_CHOICE_LIST_VECTOR *char_choices;
  SEARCH_STATE chunk_groups;
  float rating_limit = the_search->best_choice->rating();
  bool keep_going = true;
  PIECES_STATE widths;

  the_search->num_states++;
  chunk_groups = bin_to_chunks(the_search->this_state,
                               the_search->num_joints);
  bin_to_pieces (the_search->this_state, the_search->num_joints, widths);
  if (wordrec_debug_level > 1) {
    log_state("Evaluating state", the_search->num_joints,
              the_search->this_state);
  }
  getDict().LogNewSegmentation(widths);

  char_choices = evaluate_chunks(chunks_record, chunk_groups, blamer_bundle);
  getDict().SetWordsegRatingAdjustFactor(-1.0f);
  bool updated_best_choice = false;
  if (char_choices != NULL && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    getDict().SetWordsegRatingAdjustFactor(the_search->segcost_bias);
    updated_best_choice =
      getDict().permute_characters(*char_choices,
                                   the_search->best_choice,
                                   the_search->raw_choice);
    bool replaced = false;
    if (updated_best_choice) {
      if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                     NULL, ASSOCIATOR_CALLER, &replaced)) {
        keep_going = false;
      }
      CopyCharChoices(*char_choices, the_search->best_char_choices);
    }
  }
  getDict().SetWordsegRatingAdjustFactor(-1.0f);

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation (chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  if (rating_limit != the_search->best_choice->rating()) {
    ASSERT_HOST(updated_best_choice);
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else {
    ASSERT_HOST(!updated_best_choice);
    if (char_choices != NULL) fixpt->clear();
  }

  if (char_choices != NULL) delete char_choices;
  memfree(chunk_groups);

  return (keep_going);
}
Ejemplo n.º 3
0
/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results.  Find the
 * worst blob in the word and chop it up.  Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough.  Return the word level ratings.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
  TBLOB *blob;
  int index;
  int did_chopping;
  STATE state;
  BLOB_CHOICE_LIST *match_result;
  MATRIX *ratings = NULL;
  DANGERR fixpt;                 /*dangerous ambig */
  inT32 bit_count;               //no of bits

  set_denorm(&word->denorm);

  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();

  did_chopping = 0;
  for (blob = word->chopped_word->blobs, index = 0;
       blob != NULL; blob = blob->next, index++) {
    match_result = classify_blob(blob, "chop_word:", Green);
    if (match_result == NULL)
      cprintf("Null classifier output!\n");
    *char_choices += match_result;
  }
  bit_count = index - 1;
  set_n_ones(&state, char_choices->length() - 1);
  bool acceptable = false;
  bool replaced = false;
  bool best_choice_updated =
    getDict().permute_characters(*char_choices, word->best_choice,
                                 word->raw_choice);
  if (best_choice_updated &&
      getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
                                 CHOPPER_CALLER, &replaced)) {
    acceptable = true;
  }
  if (replaced)
    update_blob_classifications(word->chopped_word, *char_choices);
  CopyCharChoices(*char_choices, best_char_choices);
  if (!acceptable) {  // do more work to find a better choice
    did_chopping = 1;

    bool best_choice_acceptable = false;
    if (chop_enable)
      improve_by_chopping(word,
                          char_choices,
                          &state,
                          best_char_choices,
                          &fixpt,
                          &best_choice_acceptable);
    if (chop_debug)
      print_seams ("Final seam list:", word->seam_array);

    // The force_word_assoc is almost redundant to enable_assoc.  However,
    // it is not conditioned on the dict behavior.  For CJK, we need to force
    // the associator to be invoked.  When we figure out the exact behavior
    // of dict on CJK, we can remove the flag if it turns out to be redundant.
    if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
      ratings = word_associator(word, &state, best_char_choices,
                                &fixpt, &state);
    }
  }
  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
                                            ratings);
  if (ratings != NULL) {
    if (wordrec_debug_level > 0) {
      tprintf("Final Ratings Matrix:\n");
      ratings->print(getDict().getUnicharset());
    }
    ratings->delete_matrix_pointers();
    delete ratings;
  }
  getDict().FilterWordChoices();
  char_choices->delete_data_pointers();
  delete char_choices;

  return best_char_choices;
}