Example #1
0
// Setup and run just the initial segsearch on an established matrix,
// without doing any additional chopping or joining.
// (Internal factored version that can be used as part of the main SegSearch.)
void Wordrec::InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
                               GenericVector<SegSearchPending>* pending,
                               BestChoiceBundle* best_choice_bundle,
                               BlamerBundle* blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix%s:\n",
            wordrec_enable_assoc ? " (with assoc)" : "");
    word_res->ratings->print(getDict().getUnicharset());
  }

  pain_points->GenerateInitial(word_res);

  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;

  language_model_->InitForWord(prev_word_best_choice_,
                               assume_fixed_pitch_char_segment,
                               segsearch_max_char_wh_ratio, rating_cert_scale);

  // Initialize blamer-related information: map character boxes recorded in
  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
  // ratings matrix. We expect this step to succeed, since when running the
  // chopper we checked that the correct chops are present.
  if (blamer_bundle != NULL) {
    blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
                                            wordrec_debug_blamer);
  }

  // pending[col] tells whether there is update work to do to combine
  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
  // As the language model state is updated, pending entries are modified to
  // minimize duplication of work. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantees that all the parents would be up to date before an update
  // of a child is done.
  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());

  // Search the ratings matrix for the initial best path.
  (*pending)[0].SetColumnClassified();
  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
                       pain_points, best_choice_bundle, blamer_bundle);
}
Example #2
0
/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs fixing up all
 * the data, and incrementally runs the segmentation search until a good word
 * is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
                         word, pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been yet set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}
Example #3
0
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantess that all the parents would be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
        ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
Example #4
0
void Wordrec::SegSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle) {
  LMPainPoints pain_points(segsearch_max_pain_points,
                           segsearch_max_char_wh_ratio,
                           assume_fixed_pitch_char_segment,
                           &getDict(), segsearch_debug_level);
  // Compute scaling factor that will help us recover blob outline length
  // from classifier rating and certainty for the blob.
  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
  GenericVector<SegSearchPending> pending;
  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
                   blamer_bundle);

  if (!SegSearchDone(0)) {  // find a better choice
    if (chop_enable && word_res->chopped_word != NULL) {
      improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
                          blamer_bundle, &pain_points, &pending);
    }
    if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);

    if (blamer_bundle != NULL &&
        !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
      blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
    }
  }
  // Keep trying to find a better path by fixing the "pain points".

  MATRIX_COORD pain_point;
  float pain_point_priority;
  int num_futile_classifications = 0;
  STRING blamer_debug;
  while (wordrec_enable_assoc &&
      (!SegSearchDone(num_futile_classifications) ||
          (blamer_bundle != NULL &&
              blamer_bundle->GuidedSegsearchStillGoing()))) {
    // Get the next valid "pain point".
    bool found_nothing = true;
    LMPainPointsType pp_type;
    while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
        LM_PPTYPE_NUM) {
      if (!pain_point.Valid(*word_res->ratings)) {
        word_res->ratings->IncreaseBandSize(
            pain_point.row - pain_point.col + 1);
      }
      if (pain_point.Valid(*word_res->ratings) &&
          !word_res->ratings->Classified(pain_point.col, pain_point.row,
                                         getDict().WildcardID())) {
        found_nothing = false;
        break;
      }
    }
    if (found_nothing) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    ProcessSegSearchPainPoint(pain_point_priority, pain_point,
                              LMPainPoints::PainPointDescription(pp_type),
                              &pending, word_res, &pain_points, blamer_bundle);

    UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
                         word_res, &pain_points, best_choice_bundle,
                         blamer_bundle);
    if (!best_choice_bundle->updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    best_choice_bundle->updated = false;  // reset updated

    // See if it's time to terminate SegSearch or time for starting a guided
    // search for the true path to find the blame for the incorrect best_choice.
    if (SegSearchDone(num_futile_classifications) &&
        blamer_bundle != NULL &&
        blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
      InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
                             &blamer_debug);
    }
  }  // end while loop exploring alternative paths
  if (blamer_bundle != NULL) {
    blamer_bundle->FinishSegSearch(word_res->best_choice,
                                   wordrec_debug_blamer, &blamer_debug);
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
            language_model_->AcceptableChoiceFound());
  }
}