Example #1
/**
 * @name improve_by_chopping
 *
 * Start with the current word of blobs and its classification.  Find
 * the worst blobs and try to divide them up to improve the ratings,
 * continuing as long as the new blob splitting keeps improving them.  When
 * all the splitting has been accomplished, all the ratings memory is
 * reclaimed.
 */
void Wordrec::improve_by_chopping(WERD_RES *word,
                                  BLOB_CHOICE_LIST_VECTOR *char_choices,
                                  STATE *best_state,
                                  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                                  DANGERR *fixpt,
                                  bool *best_choice_acceptable) {
  inT32 blob_number;
  float old_best;
  int fixpt_valid = 1;
  bool updated_best_choice = false;

  while (1) {  // improvement loop
    if (!fixpt_valid) fixpt->clear();
    old_best = word->best_choice->rating();
    if (improve_one_blob(word->chopped_word, char_choices,
                         &blob_number, &word->seam_array,
                         fixpt, (fragments_guide_chopper &&
                                 word->best_choice->fragment_mark()))) {
      getDict().LogNewSplit(blob_number);
      updated_best_choice =
        getDict().permute_characters(*char_choices, word->best_choice,
                                     word->raw_choice);

      if (old_best > word->best_choice->rating()) {
        set_n_ones(best_state, char_choices->length() - 1);
        fixpt_valid = 1;
      }
      else {
        insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
        fixpt_valid = 0;
      }

      if (chop_debug)
        print_state("best state = ",
          best_state, count_blobs(word->chopped_word->blobs) - 1);
    } else {
      break;
    }

    // Check if we should break from the loop.
    bool done = false;
    bool replaced = false;
    if ((updated_best_choice &&
         (*best_choice_acceptable =
          getDict().AcceptableChoice(char_choices, word->best_choice,
                                     fixpt, CHOPPER_CALLER, &replaced))) ||
        char_choices->length() >= MAX_NUM_CHUNKS) {
      done = true;
    }
    if (replaced) update_blob_classifications(word->chopped_word,
                                              *char_choices);
    if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
    if (done) break;
  }
  if (!fixpt_valid) fixpt->clear();
}
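
For context, here is a minimal, hypothetical driver sketch, not the actual
Tesseract call site: it assumes word has already been classified once so that
word->best_choice and char_choices are populated, and the helper name
chopping_driver_sketch is illustrative only (it would have to be declared on
Wordrec).

void Wordrec::chopping_driver_sketch(WERD_RES *word,
                                     BLOB_CHOICE_LIST_VECTOR *char_choices,
                                     STATE *best_state,
                                     BLOB_CHOICE_LIST_VECTOR *best_char_choices) {
  DANGERR fixpt;                        // ambiguity danger points found so far
  bool best_choice_acceptable = false;  // set by the improvement loop
  improve_by_chopping(word, char_choices, best_state, best_char_choices,
                      &fixpt, &best_choice_acceptable);
  if (!best_choice_acceptable) {
    // A real caller would fall back to a wider segmentation search here.
  }
}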
Example #2
/**
 * @name chop_one_blob
 *
 * Start with the current one-blob word and its classification.  Find
 * the worst blob and try to divide it up to improve the ratings.
 * Used for testing the chopper.
 */
SEAM* Wordrec::chop_one_blob(const GenericVector<TBOX>& boxes,
                             const GenericVector<BLOB_CHOICE*>& blob_choices,
                             WERD_RES* word_res,
                             int* blob_number) {
  if (prioritize_division) {
    return chop_overlapping_blob(boxes, true, word_res, blob_number);
  } else {
    return improve_one_blob(blob_choices, NULL, false, true, word_res,
                            blob_number);
  }
}
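
Because chop_one_blob returns a single seam per call, a caller has to apply the
seam and iterate. The sketch below is a hypothetical test loop, not the real
Tesseract caller, and it glosses over the fact that blob_choices would need to
be rebuilt from the updated ratings matrix after every split (as Example #3
does on each pass of its loop).

void Wordrec::chop_all_sketch(const GenericVector<TBOX>& boxes,
                              const GenericVector<BLOB_CHOICE*>& blob_choices,
                              WERD_RES* word_res) {
  int blob_number;
  SEAM* seam;
  // Keep chopping until no further seam can be found. In real code,
  // blob_choices would be refreshed after each InsertSeam call.
  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
                               &blob_number)) != NULL) {
    // Record the split on the word so later stages see the extra blob.
    word_res->InsertSeam(blob_number, seam);
  }
}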
Example #3
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifies the new blobs, fixes up all
 * the data, and incrementally runs the segmentation search until a good word
 * is found or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
                         word, pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If, after running only the chopper, best_choice is incorrect and no blame
  // has yet been set, blame the classifier if best_choice is the classifier's
  // top choice and is a dictionary word (i.e. the language model could not
  // have helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
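
The newer improve_by_chopping above is only the chopping phase of the
segmentation search. The sketch below is a hypothetical wrapper, not the actual
Tesseract code path: it assumes the search state (ratings matrix, pain points,
pending vector, best-choice and blamer bundles) has already been initialized by
the caller, and shows only the call shape plus the acceptance test the loop
itself uses.

void Wordrec::chopping_phase_sketch(float rating_cert_scale,
                                    WERD_RES* word,
                                    BestChoiceBundle* best_choice_bundle,
                                    BlamerBundle* blamer_bundle,
                                    LMPainPoints* pain_points,
                                    GenericVector<SegSearchPending>* pending) {
  improve_by_chopping(rating_cert_scale, word, best_choice_bundle,
                      blamer_bundle, pain_points, pending);
  if (!language_model_->AcceptableChoiceFound()) {
    // Chopping alone did not produce an acceptable word; the full search
    // would keep generating pain points from here.
  }
}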
/**********************************************************************
 * improve_by_chopping
 *
 * Start with the current word of blobs and its classification.  Find
 * the worst blobs and try to divide them up to improve the ratings,
 * continuing as long as the new blob splitting keeps improving them.  When
 * all the splitting has been accomplished, all the ratings memory is
 * reclaimed.
 **********************************************************************/
void improve_by_chopping(register TWERD *word,
                         CHOICES_LIST *char_choices,
                         int fx,
                         STATE *best_state,
                         A_CHOICE *best_choice,
                         A_CHOICE *raw_choice,
                         SEAMS *seam_list,
                         DANGERR *fixpt,
                         STATE *chop_states,
                         INT32 *state_count,
                         STATE *correct_state,
                         INT32 pass) {
  INT32 blob_number;
  INT32 index;                   // index into chop_states
  CHOICES_LIST choices = *char_choices;
  float old_best;
  int fixpt_valid = 1;
  static INT32 old_count;        // state count remembered from pass 1

  do {
                                 /* Improvement loop */
    if (!fixpt_valid)
      fixpt->index = -1;
    old_best = class_probability (best_choice);
    choices = improve_one_blob (word, *char_choices, fx,
      &blob_number, seam_list, fixpt,
      chop_states + *state_count, correct_state,
      pass);
    if (choices != NULL) {
      LogNewSplit(blob_number);
      permute_characters (choices,
        class_probability (best_choice),
        best_choice, raw_choice);
      *char_choices = choices;

      if (old_best > class_probability (best_choice)) {
        set_n_ones (best_state, array_count (*char_choices) - 1);
        fixpt_valid = 1;
      }
      else {
        insert_new_chunk (best_state, blob_number,
          array_count (*char_choices) - 2);
        fixpt_valid = 0;
      }
      if (*state_count > 0) {
        if (pass == 0) {
          for (index = 0; index < *state_count; index++)
            insert_new_chunk (&chop_states[index], blob_number,
              array_count (*char_choices) - 2);
          set_n_ones (&chop_states[index],
            array_count (*char_choices) - 1);
        }
        (*state_count)++;
      }

      if (chop_debug)
        print_state ("best state = ",
          best_state, count_blobs (word->blobs) - 1);
      if (first_pass)
        chops_performed1++;
      else
        chops_performed2++;

    }
  }
  while (choices &&
    !AcceptableChoice (*char_choices, best_choice, raw_choice, fixpt) &&
    !blob_skip && array_count (*char_choices) < MAX_NUM_CHUNKS);
  if (pass == 0)
    old_count = *state_count;
  else {
    if (old_count != *state_count)
      fprintf (matcher_fp,
        "Mis-matched state counts, " INT32FORMAT " pass1, "
        INT32FORMAT " pass2\n", old_count, *state_count);
  }
  if (!fixpt_valid)
    fixpt->index = -1;
}