// Decode the label marginals for each candidate node. The output vector
// total_scores contains, for each node, the log of the sum of exp-scores
// over the labels (the per-node log-partition function); label_marginals
// contains those marginals ignoring the tree constraint.
void ConstituencyLabelerDecoder::DecodeLabelMarginals(
    Instance *instance, Parts *parts,
    const std::vector<double> &scores,
    std::vector<double> *total_scores,
    std::vector<double> *label_marginals) {
  ConstituencyLabelerInstanceNumeric *sentence =
      static_cast<ConstituencyLabelerInstanceNumeric*>(instance);
  ConstituencyLabelerParts *labeled_parts =
      static_cast<ConstituencyLabelerParts*>(parts);
  ConstituencyLabelerOptions *labeler_options =
      static_cast<ConstituencyLabelerOptions*>(pipe_->GetOptions());
  int num_nodes = sentence->GetNumConstituents();
  int offset_labeled_nodes, num_labeled_nodes;
  labeled_parts->GetOffsetNode(&offset_labeled_nodes, &num_labeled_nodes);

  total_scores->clear();
  total_scores->resize(num_nodes, 0.0);
  label_marginals->clear();
  label_marginals->resize(num_labeled_nodes, 0.0);

  for (int i = 0; i < num_nodes; ++i) {
    const std::vector<int> &index_node_parts = labeled_parts->FindNodeParts(i);
    // If there is no part for the null label, initialize the log-partition
    // function to exp(0.0) to account for the null label, which has score 0.0.
    LogValD total_score = (labeler_options->ignore_null_labels()) ?
        LogValD::One() : LogValD::Zero();
    for (int k = 0; k < index_node_parts.size(); ++k) {
      total_score += LogValD(scores[index_node_parts[k]], false);
    }
    (*total_scores)[i] = total_score.logabs();

    // If there is no part for the null label, initialize the sum to
    // exp(0.0)/Z to account for the null label, which has score 0.0.
    double sum = (labeler_options->ignore_null_labels()) ?
        (1.0 / total_score.as_float()) : 0.0;
    for (int k = 0; k < index_node_parts.size(); ++k) {
      LogValD marginal =
          LogValD(scores[index_node_parts[k]], false) / total_score;
      (*label_marginals)[index_node_parts[k] - offset_labeled_nodes] =
          marginal.as_float();
      sum += marginal.as_float();
    }
    if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) {
      LOG(INFO) << "Label marginals don't sum to one: sum = " << sum;
    }
  }
}
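// The per-node computation above is a softmax over the node's label scores,
// with an implicit null label of score 0.0 when ignore_null_labels() is set.
// The sketch below is an illustrative, standalone restatement of that math
// using an explicit log-sum-exp shift instead of the LogValD type; the
// function name, its parameters, and the standalone setting are assumptions
// made for illustration only, not part of the decoder.
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

std::vector<double> SoftmaxWithOptionalNull(
    const std::vector<double> &label_scores, bool implicit_null_label) {
  // Shift by the maximum score (or by 0.0 for the implicit null label) so
  // that the exponentials stay in a safe numerical range.
  double max_score = implicit_null_label ?
      0.0 : -std::numeric_limits<double>::infinity();
  for (double s : label_scores) {
    if (s > max_score) max_score = s;
  }
  // Partition function Z = [implicit null ? exp(0) : 0] + sum_l exp(s_l).
  double z = implicit_null_label ? std::exp(0.0 - max_score) : 0.0;
  for (double s : label_scores) z += std::exp(s - max_score);
  // Marginal of each explicit label: p_l = exp(s_l) / Z.
  std::vector<double> marginals(label_scores.size());
  for (std::size_t k = 0; k < label_scores.size(); ++k) {
    marginals[k] = std::exp(label_scores[k] - max_score) / z;
  }
  return marginals;
}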
void Pipe::TrainEpoch(int epoch) {
  Instance *instance;
  Parts *parts = CreateParts();
  Features *features = CreateFeatures();
  vector<double> scores;
  vector<double> gold_outputs;
  vector<double> predicted_outputs;
  double total_cost = 0.0;
  double total_loss = 0.0;
  double eta;
  int num_instances = instances_.size();
  double lambda = 1.0 / (options_->GetRegularizationConstant() *
                         (static_cast<double>(num_instances)));
  timeval start, end;
  gettimeofday(&start, NULL);
  int time_decoding = 0;
  int time_scores = 0;
  int num_mistakes = 0;

  LOG(INFO) << " Iteration #" << epoch + 1;

  dictionary_->StopGrowth();

  for (int i = 0; i < instances_.size(); i++) {
    int t = num_instances * epoch + i;
    instance = instances_[i];
    MakeParts(instance, parts, &gold_outputs);
    MakeFeatures(instance, parts, features);

    // If using only supported features, must remove the unsupported ones.
    // This is necessary not to mess up the computation of the squared norm
    // of the feature difference vector in MIRA.
    if (options_->only_supported_features()) {
      RemoveUnsupportedFeatures(instance, parts, features);
    }

    timeval start_scores, end_scores;
    gettimeofday(&start_scores, NULL);
    ComputeScores(instance, parts, features, &scores);
    gettimeofday(&end_scores, NULL);
    time_scores += diff_ms(end_scores, start_scores);

    if (options_->GetTrainingAlgorithm() == "perceptron" ||
        options_->GetTrainingAlgorithm() == "mira") {
      timeval start_decoding, end_decoding;
      gettimeofday(&start_decoding, NULL);
      decoder_->Decode(instance, parts, scores, &predicted_outputs);
      gettimeofday(&end_decoding, NULL);
      time_decoding += diff_ms(end_decoding, start_decoding);

      if (options_->GetTrainingAlgorithm() == "perceptron") {
        for (int r = 0; r < parts->size(); ++r) {
          if (!NEARLY_EQ_TOL(gold_outputs[r], predicted_outputs[r], 1e-6)) {
            ++num_mistakes;
          }
        }
        eta = 1.0;
      } else {
        CHECK(false) << "Plain mira is not implemented yet.";
      }

      MakeGradientStep(parts, features, eta, t, gold_outputs,
                       predicted_outputs);
    } else if (options_->GetTrainingAlgorithm() == "svm_mira" ||
               options_->GetTrainingAlgorithm() == "crf_mira" ||
               options_->GetTrainingAlgorithm() == "svm_sgd" ||
               options_->GetTrainingAlgorithm() == "crf_sgd") {
      double loss;
      timeval start_decoding, end_decoding;
      gettimeofday(&start_decoding, NULL);
      if (options_->GetTrainingAlgorithm() == "svm_mira" ||
          options_->GetTrainingAlgorithm() == "svm_sgd") {
        // Do cost-augmented inference.
        double cost;
        decoder_->DecodeCostAugmented(instance, parts, scores, gold_outputs,
                                      &predicted_outputs, &cost, &loss);
        total_cost += cost;
      } else {
        // Do marginal inference.
        double entropy;
        decoder_->DecodeMarginals(instance, parts, scores, gold_outputs,
                                  &predicted_outputs, &entropy, &loss);
        CHECK_GE(entropy, 0.0);
      }
      gettimeofday(&end_decoding, NULL);
      time_decoding += diff_ms(end_decoding, start_decoding);

      if (loss < 0.0) {
        if (!NEARLY_EQ_TOL(loss, 0.0, 1e-9)) {
          LOG(INFO) << "Warning: negative loss set to zero: " << loss;
        }
        loss = 0.0;
      }
      total_loss += loss;

      // Compute difference between predicted and gold feature vectors.
      FeatureVector difference;
      MakeFeatureDifference(parts, features, gold_outputs, predicted_outputs,
                            &difference);

      // Get the stepsize.
      if (options_->GetTrainingAlgorithm() == "svm_mira" ||
          options_->GetTrainingAlgorithm() == "crf_mira") {
        double squared_norm = difference.GetSquaredNorm();
        double threshold = 1e-9;
        if (loss < threshold || squared_norm < threshold) {
          eta = 0.0;
        } else {
          eta = loss / squared_norm;
          if (eta > options_->GetRegularizationConstant()) {
            eta = options_->GetRegularizationConstant();
          }
        }
      } else {
        if (options_->GetLearningRateSchedule() == "fixed") {
          eta = options_->GetInitialLearningRate();
        } else if (options_->GetLearningRateSchedule() == "invsqrt") {
          eta = options_->GetInitialLearningRate() /
              sqrt(static_cast<double>(t + 1));
        } else if (options_->GetLearningRateSchedule() == "inv") {
          eta = options_->GetInitialLearningRate() /
              static_cast<double>(t + 1);
        } else if (options_->GetLearningRateSchedule() == "lecun") {
          eta = options_->GetInitialLearningRate() /
              (1.0 + (static_cast<double>(t) /
                      static_cast<double>(num_instances)));
        } else {
          CHECK(false) << "Unknown learning rate schedule: "
                       << options_->GetLearningRateSchedule();
        }

        // Scale the parameter vector (only for SGD).
        double decay = 1 - eta * lambda;
        CHECK_GT(decay, 0.0);
        parameters_->Scale(decay);
      }

      MakeGradientStep(parts, features, eta, t, gold_outputs,
                       predicted_outputs);
    } else {
      CHECK(false) << "Unknown algorithm: "
                   << options_->GetTrainingAlgorithm();
    }
  }

  // Compute the regularization value (halved squared L2 norm of the weights).
  double regularization_value =
      lambda * static_cast<double>(num_instances) *
      parameters_->GetSquaredNorm() / 2.0;

  delete parts;
  delete features;

  gettimeofday(&end, NULL);
  LOG(INFO) << "Time: " << diff_ms(end, start);
  LOG(INFO) << "Time to score: " << time_scores;
  LOG(INFO) << "Time to decode: " << time_decoding;
  LOG(INFO) << "Number of Features: " << parameters_->Size();
  if (options_->GetTrainingAlgorithm() == "perceptron" ||
      options_->GetTrainingAlgorithm() == "mira") {
    LOG(INFO) << "Number of mistakes: " << num_mistakes;
  }
  LOG(INFO) << "Total Cost: " << total_cost << "\t"
            << "Total Loss: " << total_loss << "\t"
            << "Total Reg: " << regularization_value << "\t"
            << "Total Loss+Reg: " << total_loss + regularization_value
            << endl;
}
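// The branches above pick the step size eta differently: the MIRA variants
// use the passive-aggressive rule eta = min(C, loss / ||d||^2), where d is
// the difference between gold and predicted feature vectors and C is the
// regularization constant, while the SGD variants follow one of the
// learning-rate schedules and first decay the weights by (1 - eta * lambda).
// The helper below is a minimal, standalone restatement of the MIRA rule
// only; its name and parameters are assumptions for illustration, not part
// of Pipe.
#include <algorithm>

double MiraStepSize(double loss, double squared_norm,
                    double regularization_constant) {
  // Mirror the guard used above: with (near-)zero loss or a (near-)zero
  // feature-difference norm there is nothing to correct, so take no step.
  const double threshold = 1e-9;
  if (loss < threshold || squared_norm < threshold) return 0.0;
  // Otherwise take the loss-proportional step, capped at C.
  return std::min(regularization_constant, loss / squared_norm);
}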
// Compute marginals and evaluate the log-partition function and entropy for
// a coreference tree model.
void CoreferenceDecoder::DecodeBasicMarginals(
    Instance *instance, Parts *parts,
    const std::vector<double> &scores,
    std::vector<double> *predicted_output,
    double *log_partition_function,
    double *entropy) {
  CoreferenceDocumentNumeric *document =
      static_cast<CoreferenceDocumentNumeric*>(instance);
  CoreferenceParts *coreference_parts = static_cast<CoreferenceParts*>(parts);

  predicted_output->clear();
  predicted_output->resize(parts->size(), 0.0);

  *log_partition_function = 0.0;
  *entropy = 0.0;
  const std::vector<Mention*> &mentions = document->GetMentions();
  for (int j = 0; j < mentions.size(); ++j) {
    // List all candidate antecedent arcs for this mention.
    const std::vector<int> &arcs = coreference_parts->FindArcParts(j);
    int best_antecedent = -1;

    // Accumulate the total exp-score over the candidate arcs (the
    // per-mention partition function).
    LogValD total_score = LogValD::Zero();
    //LOG(INFO) << "num_arcs = " << arcs.size();
    for (int k = 0; k < arcs.size(); ++k) {
      int r = arcs[k];
      total_score += LogValD(scores[r], false);
      //LOG(INFO) << "scores[" << r << "] = " << scores[r];
    }
    //LOG(INFO) << "total score = " << total_score.logabs();
    *log_partition_function += total_score.logabs();

    double sum = 0.0;
    for (int k = 0; k < arcs.size(); ++k) {
      int r = arcs[k];
      LogValD marginal = LogValD(scores[r], false) / total_score;
      double marginal_value = marginal.as_float();
      (*predicted_output)[r] = marginal_value;
#if 0
      if (marginal_value > 0.0) {
        LOG(INFO) << "Marginal[" << j << ", "
                  << static_cast<CoreferencePartArc*>((*parts)[r])->parent_mention()
                  << "] = " << marginal_value;
      }
#endif
      if (scores[r] != -std::numeric_limits<double>::infinity()) {
        *entropy -= scores[r] * marginal_value;
      } else {
        CHECK_EQ(marginal_value, 0.0);
      }
      sum += marginal_value;
    }
    if (!NEARLY_EQ_TOL(sum, 1.0, 1e-9)) {
      LOG(INFO) << "Antecedent marginals don't sum to one: sum = " << sum;
    }
  }
  *entropy += *log_partition_function;
#if 0
  LOG(INFO) << "Log-partition function: " << *log_partition_function;
  LOG(INFO) << "Entropy: " << *entropy;
#endif
}
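// For each mention, the loop above is a softmax over its candidate
// antecedent arcs; the document-level log-partition function and entropy are
// the sums of the per-mention quantities, using the identity
// H = log Z - sum_r p_r * s_r. The sketch below restates that per-mention
// computation in standalone form with an explicit log-sum-exp shift; the
// function name and its parameter are assumptions for illustration, not part
// of the decoder, and a non-empty score list is assumed.
#include <algorithm>
#include <cmath>
#include <vector>

double AntecedentEntropy(const std::vector<double> &arc_scores) {
  // log Z computed with a max-shift for numerical stability.
  double max_score = arc_scores[0];
  for (double s : arc_scores) max_score = std::max(max_score, s);
  double z = 0.0;
  for (double s : arc_scores) z += std::exp(s - max_score);
  double log_z = max_score + std::log(z);
  // H = log Z - sum_r p_r * s_r, with p_r = exp(s_r - log Z); arcs with
  // zero probability (e.g. score -infinity) contribute nothing.
  double entropy = log_z;
  for (double s : arc_scores) {
    double p = std::exp(s - log_z);
    if (p > 0.0) entropy -= p * s;
  }
  return entropy;
}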