// Computes label marginals for every constituent node, treating each node
// independently of the others (i.e. ignoring the tree constraint).
//
//   instance        - the sentence; only its number of constituents is read.
//   parts           - the labeler parts; supplies, per node, the list of
//                     label-part ids and the offset of the labeled-node parts.
//   scores          - one score per part, indexed by part id.
//   total_scores    - output, reset to one entry per node; entry i receives
//                     logabs() of the LogValD accumulated over the label
//                     scores of node i (presumably the log of the sum of
//                     exp-scores -- confirm against LogValD).
//   label_marginals - output, reset to one entry per labeled-node part; the
//                     marginal of part r is written at index r minus the
//                     offset of the labeled-node parts.
//
// When ignore_null_labels() is set there is no part for the null label, so
// its contribution (score 0.0, hence exp(0.0)) is added by hand both to the
// normalizer and to the sanity-check sum.
void ConstituencyLabelerDecoder::DecodeLabelMarginals(
  Instance *instance, Parts *parts,
  const std::vector<double> &scores,
  std::vector<double> *total_scores,
  std::vector<double> *label_marginals) {
  ConstituencyLabelerOptions *options =
    static_cast<ConstituencyLabelerOptions*>(pipe_->GetOptions());
  ConstituencyLabelerParts *node_parts =
    static_cast<ConstituencyLabelerParts*>(parts);
  const int node_count =
    static_cast<ConstituencyLabelerInstanceNumeric*>(instance)
      ->GetNumConstituents();

  int first_labeled_part, labeled_part_count;
  node_parts->GetOffsetNode(&first_labeled_part, &labeled_part_count);

  // Start both outputs from a clean, zero-filled state.
  total_scores->assign(node_count, 0.0);
  label_marginals->assign(labeled_part_count, 0.0);

  const bool implicit_null = options->ignore_null_labels();

  for (int node = 0; node < node_count; ++node) {
    const std::vector<int> &label_parts = node_parts->FindNodeParts(node);

    // First pass: accumulate the normalizer. With an implicit null label the
    // accumulation starts at One() to account for its score of 0.0.
    LogValD normalizer = implicit_null ? LogValD::One() : LogValD::Zero();
    for (std::vector<int>::const_iterator it = label_parts.begin();
         it != label_parts.end(); ++it) {
      normalizer += LogValD(scores[*it], false);
    }
    (*total_scores)[node] = normalizer.logabs();

    // Second pass: normalize each label score and keep a running total of
    // the probability mass, seeded with the implicit null label's share.
    double mass = 0.0;
    if (implicit_null) {
      mass = 1.0 / normalizer.as_float();
    }
    for (std::vector<int>::const_iterator it = label_parts.begin();
         it != label_parts.end(); ++it) {
      const int part = *it;
      const double probability =
        (LogValD(scores[part], false) / normalizer).as_float();
      (*label_marginals)[part - first_labeled_part] = probability;
      mass += probability;
    }

    // Sanity check only: a bad sum is reported but does not stop decoding.
    if (!NEARLY_EQ_TOL(mass, 1.0, 1e-9)) {
      LOG(INFO) << "Label marginals don't sum to one: sum = " << mass;
    }
  }
}
// Example no. 2
void Pipe::TrainEpoch(int epoch) {
  Instance *instance;
  Parts *parts = CreateParts();
  Features *features = CreateFeatures();
  vector<double> scores;
  vector<double> gold_outputs;
  vector<double> predicted_outputs;
  double total_cost = 0.0;
  double total_loss = 0.0;
  double eta;
  int num_instances = instances_.size();
  double lambda = 1.0/(options_->GetRegularizationConstant() *
                       (static_cast<double>(num_instances)));
  timeval start, end;
  gettimeofday(&start, NULL);
  int time_decoding = 0;
  int time_scores = 0;
  int num_mistakes = 0;

  LOG(INFO) << " Iteration #" << epoch + 1;

  dictionary_->StopGrowth();

  for (int i = 0; i < instances_.size(); i++) {
    int t = num_instances * epoch + i;
    instance = instances_[i];
    MakeParts(instance, parts, &gold_outputs);
    MakeFeatures(instance, parts, features);

    // If using only supported features, must remove the unsupported ones.
    // This is necessary not to mess up the computation of the squared norm
    // of the feature difference vector in MIRA.
    if (options_->only_supported_features()) {
      RemoveUnsupportedFeatures(instance, parts, features);
    }

    timeval start_scores, end_scores;
    gettimeofday(&start_scores, NULL);
    ComputeScores(instance, parts, features, &scores);
    gettimeofday(&end_scores, NULL);
    time_scores += diff_ms(end_scores, start_scores);

    if (options_->GetTrainingAlgorithm() == "perceptron" ||
        options_->GetTrainingAlgorithm() == "mira" ) {
      timeval start_decoding, end_decoding;
      gettimeofday(&start_decoding, NULL);
      decoder_->Decode(instance, parts, scores, &predicted_outputs);
      gettimeofday(&end_decoding, NULL);
      time_decoding += diff_ms(end_decoding, start_decoding);

      if (options_->GetTrainingAlgorithm() == "perceptron") {
        for (int r = 0; r < parts->size(); ++r) {
          if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) {
            ++num_mistakes;
          }
        }
        eta = 1.0;
      } else {
        CHECK(false) << "Plain mira is not implemented yet.";
      }

      MakeGradientStep(parts, features, eta, t, gold_outputs,
                       predicted_outputs);

    } else if (options_->GetTrainingAlgorithm() == "svm_mira" ||
               options_->GetTrainingAlgorithm() == "crf_mira" ||
               options_->GetTrainingAlgorithm() == "svm_sgd" ||
               options_->GetTrainingAlgorithm() == "crf_sgd") {
      double loss;
      timeval start_decoding, end_decoding;
      gettimeofday(&start_decoding, NULL);
      if (options_->GetTrainingAlgorithm() == "svm_mira" ||
          options_->GetTrainingAlgorithm() == "svm_sgd") {
        // Do cost-augmented inference.
        double cost;
        decoder_->DecodeCostAugmented(instance, parts, scores, gold_outputs,
                                      &predicted_outputs, &cost, &loss);
        total_cost += cost;
      } else {
        // Do marginal inference.
        double entropy;
        decoder_->DecodeMarginals(instance, parts, scores, gold_outputs,
                                  &predicted_outputs, &entropy, &loss);
        CHECK_GE(entropy, 0.0);
      }
      gettimeofday(&end_decoding, NULL);
      time_decoding += diff_ms(end_decoding, start_decoding);

      if (loss < 0.0) {
        if (!NEARLY_EQ_TOL(loss, 0.0, 1e-9)) {
          LOG(INFO) << "Warning: negative loss set to zero: " << loss;
        }
        loss = 0.0;
      }
      total_loss += loss;

      // Compute difference between predicted and gold feature vectors.
      FeatureVector difference;
      MakeFeatureDifference(parts, features, gold_outputs, predicted_outputs,
                            &difference);

      // Get the stepsize.
      if (options_->GetTrainingAlgorithm() == "svm_mira" ||
          options_->GetTrainingAlgorithm() == "crf_mira") {
        double squared_norm = difference.GetSquaredNorm();
        double threshold = 1e-9;
        if (loss < threshold || squared_norm < threshold) {
          eta = 0.0;
        } else {
          eta = loss / squared_norm;
          if (eta > options_->GetRegularizationConstant()) {
            eta = options_->GetRegularizationConstant();
          }
        }
      } else {
        if (options_->GetLearningRateSchedule() == "fixed") {
          eta = options_->GetInitialLearningRate();
        } else if (options_->GetLearningRateSchedule() == "invsqrt") {
          eta = options_->GetInitialLearningRate() /
            sqrt(static_cast<double>(t+1));
        } else if (options_->GetLearningRateSchedule() == "inv") {
          eta = options_->GetInitialLearningRate() /
            static_cast<double>(t+1);
        } else if (options_->GetLearningRateSchedule() == "lecun") {
          eta = options_->GetInitialLearningRate() /
            (1.0 + (static_cast<double>(t) / static_cast<double>(num_instances)));
        } else {
          CHECK(false) << "Unknown learning rate schedule: "
                       << options_->GetLearningRateSchedule();
        }

        // Scale the parameter vector (only for SGD).
        double decay = 1 - eta * lambda;
        CHECK_GT(decay, 0.0);
        parameters_->Scale(decay);
      }

      MakeGradientStep(parts, features, eta, t, gold_outputs,
                       predicted_outputs);
    } else {
      CHECK(false) << "Unknown algorithm: " << options_->GetTrainingAlgorithm();
    }
  }

  // Compute the regularization value (halved squared L2 norm of the weights).
  double regularization_value =
      lambda * static_cast<double>(num_instances) *
      parameters_->GetSquaredNorm() / 2.0;

  delete parts;
  delete features;

  gettimeofday(&end, NULL);
  LOG(INFO) << "Time: " << diff_ms(end,start);
  LOG(INFO) << "Time to score: " << time_scores;
  LOG(INFO) << "Time to decode: " << time_decoding;
  LOG(INFO) << "Number of Features: " << parameters_->Size();
  if (options_->GetTrainingAlgorithm() == "perceptron" ||
      options_->GetTrainingAlgorithm() == "mira") {
    LOG(INFO) << "Number of mistakes: " << num_mistakes;
  }
  LOG(INFO) << "Total Cost: " << total_cost << "\t"
            << "Total Loss: " << total_loss << "\t"
            << "Total Reg: " << regularization_value << "\t"
            << "Total Loss+Reg: " << total_loss + regularization_value << endl;
}
// Example no. 3
// Computes arc marginals, the log-partition function and the entropy for a
// coreference tree model.
//
// Every mention is normalized on its own: the arcs returned by
// FindArcParts(j) for mention j compete only with each other, and each arc's
// marginal is its score divided (in LogValD arithmetic) by the accumulated
// total over those arcs.
//
//   instance               - the document; only its mention list is read.
//   parts                  - the coreference parts; supplies the arc-part ids
//                            of each mention.
//   scores                 - one score per part, indexed by part id.
//   predicted_output       - output, reset to parts->size() zeros; entry r
//                            receives the marginal of arc part r.
//   log_partition_function - output; the sum over mentions of
//                            total_score.logabs().
//   entropy                - output; log_partition_function minus the sum
//                            over all arcs of score * marginal.
void CoreferenceDecoder::DecodeBasicMarginals(
    Instance *instance, Parts *parts,
    const std::vector<double> &scores,
    std::vector<double> *predicted_output,
    double *log_partition_function,
    double *entropy) {
  CoreferenceDocumentNumeric *document =
    static_cast<CoreferenceDocumentNumeric*>(instance);
  CoreferenceParts *coreference_parts = static_cast<CoreferenceParts*>(parts);

  predicted_output->clear();
  predicted_output->resize(parts->size(), 0.0);

  *log_partition_function = 0.0;
  *entropy = 0.0;
  const std::vector<Mention*> &mentions = document->GetMentions();
  const int num_mentions = static_cast<int>(mentions.size());
  for (int j = 0; j < num_mentions; ++j) {
    // Candidate antecedent arcs of mention j.
    const std::vector<int> &arcs = coreference_parts->FindArcParts(j);
    const int num_arcs = static_cast<int>(arcs.size());

    // First pass: accumulate the normalizer over all candidate arcs.
    LogValD total_score = LogValD::Zero();
    for (int k = 0; k < num_arcs; ++k) {
      total_score += LogValD(scores[arcs[k]], false);
    }
    *log_partition_function += total_score.logabs();

    // Second pass: normalize each arc score and accumulate the expected
    // score, which is subtracted from the entropy.
    double sum = 0.0;
    for (int k = 0; k < num_arcs; ++k) {
      int r = arcs[k];
      LogValD marginal = LogValD(scores[r], false) / total_score;
      double marginal_value = marginal.as_float();
      (*predicted_output)[r] = marginal_value;
#if 0
      if (marginal_value > 0.0) {
        LOG(INFO) << "Marginal[" << j << ", "
                  << static_cast<CoreferencePartArc*>((*parts)[r])->parent_mention()
                  << "] = " << marginal_value;
      }
#endif
      if (scores[r] != -std::numeric_limits<double>::infinity()) {
        *entropy -= scores[r] * marginal_value;
      } else {
        // A -infinity score must carry zero marginal. Its term is skipped
        // because (-infinity * 0.0) would turn the entropy into NaN.
        CHECK_EQ(marginal_value, 0.0);
      }
      sum += marginal_value;
    }

    // Sanity check only: a bad sum is reported but does not stop decoding.
    if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) {
      LOG(INFO) << "Antecedent marginals don't sum to one: sum = " << sum;
    }
  }

  // Up to here *entropy holds minus the expected score; adding the
  // log-partition function completes it.
  *entropy += *log_partition_function;

#if 0
  LOG(INFO) << "Log-partition function: " << *log_partition_function;
  LOG(INFO) << "Entropy: " << *entropy;
#endif
}