// Example #1
// 0
/**
 * @name chop_word_main
 *
 * Classify the blobs of the chopped word, permute the per-blob choices into
 * a word-level choice and, if that choice is not acceptable, try to improve
 * it by chopping blobs and by running the word associator.
 *
 * Sequence of operations:
 *  - install the word's denorm with set_denorm();
 *  - call classify_blob() on every blob of word->chopped_word and collect
 *    the results in a temporary choice vector;
 *  - initialise the search state with set_n_ones() using the number of
 *    collected choices minus one;
 *  - let the dictionary permute the choices and judge the best choice;
 *  - if the best choice is not acceptable, call improve_by_chopping() (when
 *    chop_enable is set) and word_associator() (when wordrec_enable_assoc is
 *    set and chopping did not yield an acceptable choice, or whenever
 *    force_word_assoc is set);
 *  - rebuild the choices for the final state with rebuild_current_state().
 *
 * The temporary choice vector and the ratings matrix (if one was produced)
 * are released before returning.
 *
 * @param word  Word to recognise; its chopped_word, seam_array, best_choice,
 *              raw_choice and denorm members are used.
 * @return The vector produced by rebuild_current_state().
 *         NOTE(review): presumably owned by the caller -- confirm.
 */
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
  STATE state;
  MATRIX *ratings = NULL;
  DANGERR fixpt;  // dangerous ambiguities reported by the dictionary

  set_denorm(&word->denorm);

  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();

  // Classify each blob once; the vector ends up with one entry per blob.
  for (TBLOB *blob = word->chopped_word->blobs; blob != NULL;
       blob = blob->next) {
    BLOB_CHOICE_LIST *match_result =
      classify_blob(blob, "chop_word:", Green);
    if (match_result == NULL)
      cprintf("Null classifier output!\n");
    // NOTE(review): a NULL result is only reported; it is still appended so
    // that the vector keeps one entry per blob. Confirm that the consumers
    // below tolerate a NULL entry.
    *char_choices += match_result;
  }
  set_n_ones(&state, char_choices->length() - 1);

  bool replaced = false;
  const bool best_choice_updated =
    getDict().permute_characters(*char_choices, word->best_choice,
                                 word->raw_choice);
  // AcceptableChoice() is consulted only when the best choice was updated.
  const bool acceptable =
    best_choice_updated &&
    getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
                               CHOPPER_CALLER, &replaced);
  if (replaced)
    update_blob_classifications(word->chopped_word, *char_choices);
  CopyCharChoices(*char_choices, best_char_choices);

  if (!acceptable) {  // do more work to find a better choice
    bool best_choice_acceptable = false;
    if (chop_enable)
      improve_by_chopping(word,
                          char_choices,
                          &state,
                          best_char_choices,
                          &fixpt,
                          &best_choice_acceptable);
    if (chop_debug)
      print_seams("Final seam list:", word->seam_array);

    // The force_word_assoc is almost redundant to enable_assoc.  However,
    // it is not conditioned on the dict behavior.  For CJK, we need to force
    // the associator to be invoked.  When we figure out the exact behavior
    // of dict on CJK, we can remove the flag if it turns out to be redundant.
    if ((wordrec_enable_assoc && !best_choice_acceptable) ||
        force_word_assoc) {
      ratings = word_associator(word, &state, best_char_choices,
                                &fixpt, &state);
    }
  }

  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
                                            ratings);
  if (ratings != NULL) {
    if (wordrec_debug_level > 0) {
      tprintf("Final Ratings Matrix:\n");
      ratings->print(getDict().getUnicharset());
    }
    ratings->delete_matrix_pointers();
    delete ratings;
  }
  getDict().FilterWordChoices();

  // The initial per-blob classifications are no longer needed.
  char_choices->delete_data_pointers();
  delete char_choices;

  return best_char_choices;
}
/**********************************************************************
 * chop_word_main
 *
 * Classify the blobs in this word and permute the results.  If the
 * permuted best choice is not acceptable (or, in tester/trainer mode,
 * does not match word->correct), chop blobs and optionally run the
 * associator to look for a better answer.  Return the character level
 * choice list.
 *
 * Parameters:
 *   word         word whose blobs are classified; word->correct is read
 *                only when tester or trainer is set
 *   fx           forwarded unchanged to classify_blob, improve_by_chopping,
 *                word_associator and rebuild_current_state
 *   best_choice  output; reset with set_null_choice, then filled by
 *                permute_characters and the later search steps
 *   raw_choice   output; handled like best_choice
 *   tester       when set, a mismatch between word->correct and the best
 *                choice string forces chopping/association, and the
 *                choices are always rebuilt
 *   trainer      same effect as tester
 *
 * Returns the choice list: the initial per-blob classifications, or the
 * result of rebuild_current_state when chopping was attempted or
 * tester/trainer is set.
 *
 * The function keeps function-local static state (best_state and
 * chop_states) and updates the externally defined bits_in_states and
 * words_chopped1/words_chopped2, so values persist between calls.
 **********************************************************************/
CHOICES_LIST chop_word_main(TWERD *word,
                            int fx,
                            A_CHOICE *best_choice,
                            A_CHOICE *raw_choice,
                            BOOL8 tester,
                            BOOL8 trainer) {
  TBLOB *pblob;
  TBLOB *blob;
  CHOICES_LIST char_choices;
  int index;
  int did_chopping;
  float rating_limit = 1000.0;
  STATE state;
  SEAMS seam_list = NULL;
  CHOICES match_result;
  MATRIX ratings = NULL;
  DANGERR fixpt;                 // dangerous ambiguities
  INT32 state_count;             // number of states recorded so far
  INT32 bit_count;               // number of bits (blob count minus one)
  static STATE best_state;
  static STATE chop_states[64];  // in between states

  state_count = 0;
  set_null_choice(best_choice);
  set_null_choice(raw_choice);

  char_choices = new_choice_list();

  did_chopping = 0;
  // Classify every blob, giving the classifier its left and right
  // neighbours (pblob and blob->next) as context.
  for (blob = word->blobs, pblob = NULL, index = 0; blob != NULL;
       blob = blob->next, index++) {
    match_result =
      (CHOICES) classify_blob(pblob, blob, blob->next, NULL, fx,
                              "chop_word:", Green, &chop_states[0],
                              &best_state, matcher_pass, index);
    char_choices = array_push(char_choices, match_result);
    pblob = blob;
  }
  bit_count = index - 1;
  permute_characters(char_choices, rating_limit, best_choice, raw_choice);

  set_n_ones(&state, array_count(char_choices) - 1);
  if (matcher_fp != NULL) {
    if (matcher_pass == 0) {
      bits_in_states = bit_count;
      chop_states[state_count] = state;
    }
    state_count++;
  }

  // Explicit parentheses: '&&' binds tighter than '||', so the correctness
  // comparison only applies in tester/trainer mode.
  if (!AcceptableChoice(char_choices, best_choice, raw_choice, &fixpt) ||
      ((tester || trainer) &&
       strcmp(word->correct, class_string(best_choice)) != 0)) {
    did_chopping = 1;
    if (first_pass)
      words_chopped1++;
    else
      words_chopped2++;

    seam_list = start_seam_list(word->blobs);

    if (chop_enable)
      improve_by_chopping(word,
                          &char_choices,
                          fx,
                          &state,
                          best_choice,
                          raw_choice,
                          &seam_list,
                          &fixpt,
                          chop_states,
                          &state_count,
                          &best_state,
                          matcher_pass);

    if (chop_debug)
      print_seams("Final seam list:", seam_list);

    // Re-evaluated here on purpose: best_choice may have been changed by
    // improve_by_chopping above.
    if ((enable_assoc &&
         !AcceptableChoice(char_choices, best_choice, raw_choice, NULL)) ||
        ((tester || trainer) &&
         strcmp(word->correct, class_string(best_choice)) != 0)) {
      ratings = word_associator(word->blobs, seam_list, &state, fx,
                                best_choice, raw_choice, word->correct,
                                &fixpt,
                                &best_state, matcher_pass);
    }
    bits_in_states = bit_count + state_count - 1;
  }

  if (ratings != NULL)
    free_matrix(ratings);
  if (did_chopping || tester || trainer)
    char_choices = rebuild_current_state(word->blobs, seam_list, &state,
                                         char_choices, fx);
  if (seam_list != NULL)
    free_seam_list(seam_list);
  if (matcher_fp != NULL) {
    best_state = state;
  }
  FilterWordChoices();
  return char_choices;
}
// Example #3
// 0
/**
 * Runs the segmentation search for a single word.
 *
 * After InitialSegSearch() the function, unless SegSearchDone(0) already
 * holds, tries to improve the result by chopping (when chop_enable is set
 * and the word has a chopped_word) and lets the blamer record chopper blame
 * if the best choice is not correct.  While wordrec_enable_assoc is set it
 * then repeatedly takes a pain point from the queue, processes it and
 * updates the search nodes, until the search is done and no blamer-guided
 * search is still going, or until the queue yields no usable pain point.
 *
 * @param word_res            Word being searched; its ratings matrix,
 *                            chopped_word, seam_array and best_choice are
 *                            used.
 * @param best_choice_bundle  Search result bundle; its `updated` flag is
 *                            read and reset after every processed pain point.
 * @param blamer_bundle       Optional blamer; may be NULL, in which case all
 *                            blamer steps are skipped.
 */
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  LMPainPoints pp_queue(segsearch_max_pain_points,
                        segsearch_max_char_wh_ratio,
                        assume_fixed_pitch_char_segment,
                        &getDict(), segsearch_debug_level);
  // Scaling factor used to recover the blob outline length from the
  // classifier rating and certainty of a blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  GenericVector<SegSearchPending> pending_updates;
  InitialSegSearch(word_res, &pp_queue, &pending_updates, best_choice_bundle,
                   blamer_bundle);

  const bool has_blamer = (blamer_bundle != NULL);

  // Not done after the initial pass: try chopping for a better choice.
  if (!SegSearchDone(0)) {
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pp_queue, &pending_updates);
    }
    if (chop_debug) {
      SEAM::PrintSeams("Final seam list:", word_res->seam_array);
    }
    if (has_blamer &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }

  // Keep looking for a better path by fixing the "pain points".
  MATRIX_COORD next_point;
  float next_priority;
  int futile_count = 0;
  STRING blamer_log;
  while (wordrec_enable_assoc) {
    // Stop when the search is done, unless a blamer-guided search is
    // still in progress.
    if (SegSearchDone(futile_count) &&
        !(has_blamer && blamer_bundle->GuidedSegsearchStillGoing())) {
      break;
    }

    // Pull entries off the queue until one is valid and not yet classified.
    bool have_point = false;
    LMPainPointsType point_type;
    for (;;) {
      point_type = pp_queue.Deque(&next_point, &next_priority);
      if (point_type == LM_PPTYPE_NUM) break;  // nothing left in the queue

      // An invalid point gets one chance: widen the band and re-check.
      if (!next_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            next_point.row - next_point.col + 1);
      }
      if (!next_point.Valid(*word_res->ratings)) continue;
      if (word_res->ratings->Classified(next_point.col, next_point.row,
                                        getDict().WildcardID())) {
        continue;
      }
      have_point = true;
      break;
    }
    if (!have_point) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }

    ProcessSegSearchPainPoint(next_priority, next_point,
                              LMPainPoints::PainPointDescription(point_type),
                              &pending_updates, word_res, &pp_queue,
                              blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, next_point.col, &pending_updates,
                         word_res, &pp_queue, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) {
      ++futile_count;
    }

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", futile_count);
    }

    // Clear the flag so the next iteration sees only its own update.
    best_choice_bundle->updated = false;

    // Either the search terminates here, or a guided search for the true
    // path is started to find the blame for the incorrect best_choice.
    if (SegSearchDone(futile_count) && has_blamer &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pp_queue, blamer_bundle,
                             &blamer_log);
    }
  }  // end of the loop exploring alternative paths

  if (has_blamer) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_log);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}