Exemple #1
0
void Wordrec::ProcessSegSearchPainPoint(
    float pain_point_priority,
    const MATRIX_COORD &pain_point, const char* pain_point_type,
    GenericVector<SegSearchPending>* pending, WERD_RES *word_res,
    LMPainPoints *pain_points, BlamerBundle *blamer_bundle) {
  if (segsearch_debug_level > 0) {
    tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
            pain_point_type, pain_point_priority,
            pain_point.col, pain_point.row);
  }
  ASSERT_HOST(pain_points != NULL);
  MATRIX *ratings = word_res->ratings;
  // Classify blob [pain_point.col pain_point.row]
  if (!pain_point.Valid(*ratings)) {
    ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
  }
  ASSERT_HOST(pain_point.Valid(*ratings));
  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
                                                pain_point.col, pain_point.row,
                                                pain_point_type,
                                                word_res->chopped_word,
                                                blamer_bundle);
  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
  if (lst == NULL) {
    ratings->put(pain_point.col, pain_point.row, classified);
  } else {
    // We can not delete old BLOB_CHOICEs, since they might contain
    // ViterbiStateEntries that are parents of other "active" entries.
    // Thus if the matrix cell already contains classifications we add
    // the new ones to the beginning of the list.
    BLOB_CHOICE_IT it(lst);
    it.add_list_before(classified);
    delete classified;  // safe to delete, since empty after add_list_before()
    classified = NULL;
  }

  if (segsearch_debug_level > 0) {
    print_ratings_list("Updated ratings matrix with a new entry:",
                       ratings->get(pain_point.col, pain_point.row),
                       getDict().getUnicharset());
    ratings->print(getDict().getUnicharset());
  }

  // Insert initial "pain points" to join the newly classified blob
  // with its left and right neighbors.
  if (classified != NULL && !classified->empty()) {
    if (pain_point.col > 0) {
      pain_points->GeneratePainPoint(
          pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
    if (pain_point.row + 1 < ratings->dimension()) {
      pain_points->GeneratePainPoint(
          pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
          true, segsearch_max_char_wh_ratio, word_res);
    }
  }
  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
}
void LMPainPoints::GenerateInitial(WERD_RES *word_res) {
  MATRIX *ratings = word_res->ratings;
  AssociateStats associate_stats;
  for (int col = 0; col < ratings->dimension(); ++col) {
    int row_end = MIN(ratings->dimension(), col + ratings->bandwidth() + 1);
    for (int row = col + 1; row < row_end; ++row) {
      MATRIX_COORD coord(col, row);
      if (coord.Valid(*ratings) &&
          ratings->get(col, row) != NOT_CLASSIFIED) continue;
      // Add an initial pain point if needed.
      if (ratings->Classified(col, row - 1, dict_->WildcardID()) ||
          (col + 1 < ratings->dimension() &&
              ratings->Classified(col + 1, row, dict_->WildcardID()))) {
        GeneratePainPoint(col, row, LM_PPTYPE_SHAPE, 0.0,
                          true, max_char_wh_ratio_, word_res);
      }
    }
  }
}
Exemple #3
0
void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
                        WERD_CHOICE *best_choice,
                        BLOB_CHOICE_LIST_VECTOR *best_char_choices,
                        WERD_CHOICE *raw_choice,
                        STATE *output_best_state) {
  int row, col = 0;
  if (segsearch_debug_level > 0) {
    tprintf("Starting SegSearch on ratings matrix:\n");
    chunks_record->ratings->print(getDict().getUnicharset());
  }
  // Start with a fresh best_choice since rating adjustments
  // used by the chopper and the new segmentation search are not compatible.
  best_choice->set_rating(WERD_CHOICE::kBadRating);
  // Clear best choice accumulator (that is used for adaption), so that
  // choices adjusted by chopper do not interfere with the results from the
  // segmentation search.
  getDict().ClearBestChoiceAccum();

  MATRIX *ratings = chunks_record->ratings;
  // Priority queue containing pain points generated by the language model
  // The priority is set by the language model components, adjustments like
  // seam cost and width priority are factored into the priority.
  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);

  // best_path_by_column records the lowest cost path found so far for each
  // column of the chunks_record->ratings matrix over all the rows.
  BestPathByColumn *best_path_by_column =
    new BestPathByColumn[ratings->dimension()];
  for (col = 0; col < ratings->dimension(); ++col) {
    best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
    best_path_by_column[col].best_vse = NULL;
  }

  language_model_->InitForWord(prev_word_best_choice_, &denorm_,
                               assume_fixed_pitch_char_segment,
                               best_choice->certainty(),
                               segsearch_max_char_wh_ratio,
                               pain_points, chunks_record);

  MATRIX_COORD *pain_point;
  float pain_point_priority;
  BestChoiceBundle best_choice_bundle(
      output_best_state, best_choice, raw_choice, best_char_choices);

  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
  // where i is the column of the child. Initially all the classified entries
  // in the ratings matrix from column 0 (with parent NULL) are inserted into
  // pending[0]. As the language model state is updated, new child/parent
  // pairs are inserted into the lists. Next, the entries in pending[1] are
  // considered, and so on. It is important that during the update the
  // children are considered in the non-decreasing order of their column, since
  // this guarantess that all the parents would be up to date before an update
  // of a child is done.
  SEG_SEARCH_PENDING_LIST *pending =
    new SEG_SEARCH_PENDING_LIST[ratings->dimension()];

  // Search for the ratings matrix for the initial best path.
  for (row = 0; row < ratings->dimension(); ++row) {
    if (ratings->get(0, row) != NOT_CLASSIFIED) {
      pending[0].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(row, NULL, LanguageModel::kAllChangedFlag));
    }
  }
  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
                       pain_points, &best_choice_bundle);

  // Keep trying to find a better path by fixing the "pain points".
  int num_futile_classifications = 0;
  while (!(language_model_->AcceptableChoiceFound() ||
           num_futile_classifications >=
           segsearch_max_futile_classifications)) {
    // Get the next valid "pain point".
    int pop;
    while (true) {
      pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
      if (pop == EMPTY) break;
      if (pain_point->Valid(*ratings) &&
        ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
        break;
      } else {
        delete pain_point;
      }
    }
    if (pop == EMPTY) {
      if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
      break;
    }
    if (segsearch_debug_level > 0) {
      tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
              pain_point_priority, pain_point->col, pain_point->row);
    }
    BLOB_CHOICE_LIST *classified = classify_piece(
        chunks_record->chunks, chunks_record->splits,
        pain_point->col, pain_point->row);
    ratings->put(pain_point->col, pain_point->row, classified);

    if (segsearch_debug_level > 0) {
      print_ratings_list("Updated ratings matrix with a new entry:",
                         ratings->get(pain_point->col, pain_point->row),
                         getDict().getUnicharset());
      chunks_record->ratings->print(getDict().getUnicharset());
    }

    // Insert initial "pain points" to join the newly classified blob
    // with its left and right neighbors.
    if (!classified->empty()) {
      float worst_piece_cert;
      bool fragmented;
      if (pain_point->col > 0) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col-1, pain_point->row, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col-1, pain_point->row, false,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
      if (pain_point->row+1 < ratings->dimension()) {
        language_model_->GetWorstPieceCertainty(
            pain_point->col, pain_point->row+1, chunks_record->ratings,
            &worst_piece_cert, &fragmented);
        language_model_->GeneratePainPoint(
            pain_point->col, pain_point->row+1, true,
            LanguageModel::kInitialPainPointPriorityAdjustment,
            worst_piece_cert, fragmented, best_choice->certainty(),
            segsearch_max_char_wh_ratio, NULL, NULL,
            chunks_record, pain_points);
      }
    }

    // Record a pending entry with the pain_point and each of its parents.
    int parent_row = pain_point->col - 1;
    if (parent_row < 0) {  // this node has no parents
      pending[pain_point->col].add_sorted(
          SEG_SEARCH_PENDING::compare, true,
          new SEG_SEARCH_PENDING(pain_point->row, NULL,
                                 LanguageModel::kAllChangedFlag));
    } else {
      for (int parent_col = 0; parent_col < pain_point->col; ++parent_col) {
        if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
          pending[pain_point->col].add_sorted(
              SEG_SEARCH_PENDING::compare, true,
              new SEG_SEARCH_PENDING(pain_point->row,
                                     ratings->get(parent_col, parent_row),
                                     LanguageModel::kAllChangedFlag));
        }
      }
    }
    UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
                         chunks_record, pain_points, &best_choice_bundle);
    if (!best_choice_bundle.updated) ++num_futile_classifications;

    if (segsearch_debug_level > 0) {
      tprintf("num_futile_classifications %d\n", num_futile_classifications);
    }

    // Clean up
    best_choice_bundle.updated = false;
    delete pain_point;  // done using this pain point
  }

  if (segsearch_debug_level > 0) {
    tprintf("Done with SegSearch (AcceptableChoiceFound: %d\n",
            language_model_->AcceptableChoiceFound());
  }

  // Clean up.
  FreeHeapData(pain_points, MATRIX_COORD::Delete);
  delete[] best_path_by_column;
  delete[] pending;
  for (row = 0; row < ratings->dimension(); ++row) {
    for (col = 0; col <= row; ++col) {
      BLOB_CHOICE_LIST *rating = ratings->get(col, row);
      if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
    }
  }
}
Exemple #4
0
void Wordrec::UpdateSegSearchNodes(
    int starting_col,
    SEG_SEARCH_PENDING_LIST *pending[],
    BestPathByColumn *best_path_by_column[],
    CHUNKS_RECORD *chunks_record,
    HEAP *pain_points,
    BestChoiceBundle *best_choice_bundle) {
  MATRIX *ratings = chunks_record->ratings;
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
    }
    // Iterate over the pending list for this column.
    SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
    SEG_SEARCH_PENDING_IT pending_it(pending_list);
    GenericVector<int> non_empty_rows;
    while (!pending_it.empty()) {
      // Update language model state of this child+parent pair.
      SEG_SEARCH_PENDING *p = pending_it.extract();
      if (non_empty_rows.length() == 0 ||
          non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
        non_empty_rows.push_back(p->child_row);
      }
      BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
      LanguageModelFlagsType new_changed =
        language_model_->UpdateState(p->changed, col, p->child_row,
                                     current_node, p->parent, pain_points,
                                     best_path_by_column,
                                     chunks_record, best_choice_bundle);
      if (new_changed) {
        // Since the language model state of this entry changed, add all the
        // pairs with it as a parent and each of its children to pending, so
        // that the children are updated as well.
        int child_col = p->child_row + 1;
        for (int child_row = child_col;
             child_row < ratings->dimension(); ++child_row) {
          if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
            SEG_SEARCH_PENDING *new_pending =
              new SEG_SEARCH_PENDING(child_row, current_node, 0);
            SEG_SEARCH_PENDING *actual_new_pending =
              reinterpret_cast<SEG_SEARCH_PENDING *>(
                  (*pending)[child_col].add_sorted_and_find(
                  SEG_SEARCH_PENDING::compare, true, new_pending));
            if (new_pending != actual_new_pending) delete new_pending;
            actual_new_pending->changed |= new_changed;
            if (segsearch_debug_level > 0) {
                  tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
                          " changed=0x%x to pending\n", child_col,
                          actual_new_pending->child_row,
                          col, p->child_row, actual_new_pending->changed);
            }
          }
        }
      }  // end if new_changed
      delete p;  // clean up
      pending_it.forward();
    } // end while !pending_it.empty()
    language_model_->GeneratePainPointsFromColumn(
      col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
      pain_points, best_path_by_column, chunks_record);
  }  // end for col

  if (best_choice_bundle->updated) {
    language_model_->GeneratePainPointsFromBestChoice(
        pain_points, chunks_record, best_choice_bundle);
  }

  language_model_->CleanUp();
}
Exemple #5
0
void Wordrec::UpdateSegSearchNodes(
    float rating_cert_scale,
    int starting_col,
    GenericVector<SegSearchPending>* pending,
    WERD_RES *word_res,
    LMPainPoints *pain_points,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  MATRIX *ratings = word_res->ratings;
  ASSERT_HOST(ratings->dimension() == pending->size());
  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
  for (int col = starting_col; col < ratings->dimension(); ++col) {
    if (!(*pending)[col].WorkToDo()) continue;
    int first_row = col;
    int last_row = MIN(ratings->dimension() - 1,
                       col + ratings->bandwidth() - 1);
    if ((*pending)[col].SingleRow() >= 0) {
      first_row = last_row = (*pending)[col].SingleRow();
    }
    if (segsearch_debug_level > 0) {
      tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
              col, first_row, last_row,
              (*pending)[col].IsRowJustClassified(INT32_MAX));
    }
    // Iterate over the pending list for this column.
    for (int row = first_row; row <= last_row; ++row) {
      // Update language model state of this child+parent pair.
      BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
      LanguageModelState *parent_node =
          col == 0 ? NULL : best_choice_bundle->beam[col - 1];
      if (current_node != NULL &&
          language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
                                       col, row, current_node, parent_node,
                                       pain_points, word_res,
                                       best_choice_bundle, blamer_bundle) &&
          row + 1 < ratings->dimension()) {
        // Since the language model state of this entry changed, process all
        // the child column.
        (*pending)[row + 1].RevisitWholeColumn();
        if (segsearch_debug_level > 0) {
          tprintf("Added child col=%d to pending\n", row + 1);
        }
      }  // end if UpdateState.
    }  // end for row.
  }  // end for col.
  if (best_choice_bundle->best_vse != NULL) {
    ASSERT_HOST(word_res->StatesAllValid());
    if (best_choice_bundle->best_vse->updated) {
      pain_points->GenerateFromPath(rating_cert_scale,
                                    best_choice_bundle->best_vse, word_res);
      if (!best_choice_bundle->fixpt.empty()) {
        pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
                                        best_choice_bundle->best_vse, word_res);
      }
    }
  }
  // The segsearch is completed. Reset all updated flags on all VSEs and reset
  // all pendings.
  for (int col = 0; col < pending->size(); ++col) {
    (*pending)[col].Clear();
    ViterbiStateEntry_IT
        vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
    for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
      vse_it.data()->updated = false;
    }
  }
}