vector<BeamTreeResult<T>> best_trees(vector<Mat<T>> input, int beam_width) const {
    auto leaves = convert_to_leaves(input);
    vector<PartialTree> candidates = { PartialTree(leaves) };
    while (candidates[0].nodes.size() > 1) {
        vector<PartialTree> new_candidates;
        for (auto& candidate : candidates) {
            for (auto& new_candidate : cangen(candidate, beam_width)) {
                new_candidates.emplace_back(new_candidate);
            }
        }
        sort(new_candidates.begin(), new_candidates.end(),
                [this](const PartialTree& c1, const PartialTree& c2) {
            return candidate_log_probability(c1) > candidate_log_probability(c2);
        });
        candidates = vector<PartialTree>(
            new_candidates.begin(),
            new_candidates.begin() + min((size_t)beam_width, new_candidates.size())
        );
        for (size_t cidx = 0; cidx + 1 < candidates.size(); ++cidx) {
            assert2(candidates[cidx].nodes.size() == candidates[cidx + 1].nodes.size(),
                    "Generated candidates of different sizes.");
        }
    }
    vector<BeamTreeResult<T>> results;
    for (auto& tree : candidates) {
        results.emplace_back(tree.nodes[0], tree.derivation);
    }
    return results;
}
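// Worked trace (illustrative only): with 4 leaves and beam_width = 2, each
// pass of the while-loop joins one adjacent pair in every surviving
// candidate, so candidate sizes shrink in lockstep, 4 -> 3 -> 2 -> 1
// (the assert above checks this invariant). After the loop, every candidate
// holds a single root node plus the derivation: the sequence of join
// positions that produced it.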
shared_ptr<visualizable::Tree> visualize_derivation(vector<uint> derivation, vector<string> words) {
    using visualizable::Tree;
    vector<shared_ptr<Tree>> result;
    std::transform(words.begin(), words.end(), std::back_inserter(result),
            [](const string& a) { return make_shared<Tree>(a); });
    for (auto merge_idx : derivation) {
        vector<shared_ptr<Tree>> new_result;
        for (size_t ridx = 0; ridx < merge_idx; ++ridx) {
            new_result.push_back(result[ridx]);
        }
        new_result.push_back(make_shared<Tree>(std::initializer_list<shared_ptr<Tree>> {
            result[merge_idx],
            result[merge_idx + 1]
        }));
        for (size_t ridx = merge_idx + 2; ridx < result.size(); ++ridx) {
            new_result.push_back(result[ridx]);
        }
        result = new_result;
    }
    assert2(result.size() == 1, "Szymon messed up.");
    return result[0];
}
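// Example (illustrative): for words {"1", "+", "2"} the derivation {1, 0}
// first merges "+" and "2" into ("+" "2"), then merges "1" with that node,
// yielding the single tree ("1" ("+" "2")). Each entry of `derivation` is an
// index into the *current* node list, which shrinks by one per merge.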
Conf& Conf::def_choice(std::string name, std::vector<std::string> choices, std::string default_value) {
    assert2(in_vector(choices, default_value),
            MS() << default_value << " is not an option for " << name);
    assert2(choices.size() >= 2,
            MS() << "At least two choices are needed for " << name);
    auto c = make_shared<Choice>();
    c->choices       = choices;
    c->default_value = default_value;
    c->value         = default_value;
    items[name] = c;
    return *this;
}
/**
Given an ordered set of n nodes, find the best contiguous pairs to
join to form n-1 nodes. Return the `beam_width` best sets of nodes
with the resulting join applied.

Inputs
------
PartialTree candidate : partial tree whose nodes to join
int beam_width        : number of joins to consider

Outputs
-------
vector<PartialTree> results : new partial trees with one more join applied
**/
vector<PartialTree> cangen(PartialTree candidate, int beam_width) const {
    assert2(candidate.nodes.size() >= 2,
            "Must at least have 2 states to join for candidate generation.");
    size_t num_candidates = min((size_t)beam_width, candidate.nodes.size() - 1);
    vector<Node> possible_joins;
    vector<Mat<T>> scores;
    for (size_t sidx = 0; sidx + 1 < candidate.nodes.size(); ++sidx) {
        possible_joins.emplace_back(
            Mat<T>(),
            join_states(candidate.nodes[sidx], candidate.nodes[sidx + 1])
        );
        scores.emplace_back(prob_decoder.activate(possible_joins.back().state.hidden));
    }
    auto normalized_scores = MatOps<T>::softmax(scores);
    for (size_t sidx = 0; sidx + 1 < candidate.nodes.size(); ++sidx) {
        possible_joins[sidx].log_probability = normalized_scores[sidx].log()
                                             + candidate.nodes[sidx].log_probability
                                             + candidate.nodes[sidx + 1].log_probability;
    }
    // initialize original index locations
    vector<size_t> idx(possible_joins.size());
    for (size_t i = 0; i < idx.size(); ++i) idx[i] = i;
    // sort indexes by descending log probability of the corresponding join
    sort(idx.begin(), idx.end(), [&possible_joins](size_t i1, size_t i2) {
        return possible_joins[i1].log_probability.w(0) > possible_joins[i2].log_probability.w(0);
    });
    vector<PartialTree> results;
    for (size_t cidx = 0; cidx < num_candidates; ++cidx) {
        vector<Node> result;
        size_t join_idx = idx[cidx];
        for (size_t sidx = 0; sidx < join_idx; ++sidx)
            result.emplace_back(candidate.nodes[sidx]);
        result.emplace_back(possible_joins[join_idx]);
        for (size_t sidx = join_idx + 2; sidx < candidate.nodes.size(); ++sidx) {
            result.emplace_back(candidate.nodes[sidx]);
        }
        assert(result.size() == candidate.nodes.size() - 1);
        auto new_derivation = candidate.derivation; // copy
        // join_idx encodes the decision we made to join nodes join_idx and
        // join_idx + 1, which is what visualize_derivation expects when
        // replaying the merges. (Pushing cidx, the candidate's rank, would
        // record the wrong position whenever idx[cidx] != cidx.)
        new_derivation.push_back(join_idx);
        results.emplace_back(PartialTree(result, new_derivation));
    }
    return results;
}
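// Scoring note (restating what the code above computes): each proposed join j
// is scored as
//     log p(join j) = log softmax(scores)_j
//                   + log p(left subtree) + log p(right subtree)
// so a candidate's total log probability is the sum of the log-likelihoods of
// every merge decision along its derivation, and sorting `idx` ranks joins by
// that combined score.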
Conf& Conf::def_int(std::string name, int lower_bound, int upper_bound, int default_value) {
    assert2(lower_bound <= default_value && default_value <= upper_bound,
            MS() << "Default value for " << name << " not in range.");
    auto i = make_shared<Int>();
    i->lower_bound   = lower_bound;
    i->upper_bound   = upper_bound;
    i->default_value = default_value;
    i->value         = default_value;
    items[name] = i;
    return *this;
}
Conf& Conf::def_float(std::string name, double lower_bound, double upper_bound, double default_value) {
    assert2(lower_bound <= default_value && default_value <= upper_bound,
            MS() << "Default value for " << name << " not in range.");
    auto f = make_shared<Float>();
    f->lower_bound   = lower_bound;
    f->upper_bound   = upper_bound;
    f->default_value = default_value;
    f->value         = default_value;
    items[name] = f;
    return *this;
}
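// Usage sketch (hypothetical option names and bounds): the def_* helpers
// above each return *this, so definitions chain:
//
//     Conf conf;
//     conf.def_choice("activation", {"relu", "tanh"}, "tanh")
//         .def_int("hidden_size", 1, 2048, 100)
//         .def_float("learning_rate", 1e-6, 1.0, 0.01);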
void training_loop(std::shared_ptr<Solver::AbstractSolver<REAL_t>> solver,
                   model_t& model,
                   std::function<vector<uint>(vector<uint>&)> pred_fun,
                   vector<numeric_example_t>& train,
                   vector<numeric_example_t>& validate) {
    auto& vocab = arithmetic::vocabulary;
    auto params = model.parameters();
    int epoch = 0;
    int difficulty_waiting = 0;
    auto end_symbol_idx = vocab.word2index[utils::end_symbol];

    int beam_width = FLAGS_beam_width;
    if (beam_width < 1)
        utils::exit_with_message(MS() << "Beam width must be strictly positive (got "
                                      << beam_width << ")");

    Throttled throttled_examples;
    Throttled throttled_validation;

    bool target_accuracy_reached = false;

    while (!target_accuracy_reached && epoch < FLAGS_graduation_time) {
        auto indices = utils::random_arange(train.size());
        REAL_t minibatch_error = 0.0;
        // one minibatch
        for (auto indices_begin = indices.begin();
                indices_begin < indices.begin() + std::min((size_t)FLAGS_minibatch, train.size());
                indices_begin++) {
            // <training>
            auto& example = train[*indices_begin];
            auto error = model.error(example, beam_width);
            error.grad();
            graph::backward();
            minibatch_error += error.w(0);
            // </training>

            // <reporting>
            throttled_examples.maybe_run(seconds(10), [&]() {
                graph::NoBackprop nb;
                auto random_example_index = utils::randint(0, validate.size() - 1);
                auto& expression = validate[random_example_index].first;

                auto predictions = model.predict(expression,
                                                 beam_width,
                                                 MAX_OUTPUT_LENGTH,
                                                 vocab.word2index.at(utils::end_symbol));
                auto expression_string = arithmetic::vocabulary.decode(&expression);
                if (expression_string.back() == utils::end_symbol)
                    expression_string.resize(expression_string.size() - 1);
                std::cout << utils::join(expression_string) << std::endl;

                vector<string> prediction_string;
                vector<double> prediction_probability;
                for (auto& prediction : predictions) {
                    if (validate[random_example_index].second == prediction.prediction) {
                        std::cout << utils::green;
                    }
                    prediction_probability.push_back(prediction.get_probability().w(0));
                    std::cout << "= (" << std::setprecision(3)
                              << prediction.get_probability().log().w(0) << ") ";
                    auto digits = vocab.decode(&prediction.prediction);
                    if (digits.back() == utils::end_symbol)
                        digits.pop_back();
                    auto joined_digits = utils::join(digits);
                    prediction_string.push_back(joined_digits);
                    std::cout << joined_digits << utils::reset_color << std::endl;
                }

                auto vgrid = make_shared<visualizable::GridLayout>();
                assert2(predictions[0].derivations.size() == predictions[0].nodes.size(),
                        "Szymon messed up.");
                for (size_t didx = 0;
                        didx < min((size_t)FLAGS_visualizer_trees, predictions[0].derivations.size());
                        ++didx) {
                    auto visualization = visualize_derivation(
                        predictions[0].derivations[didx],
                        vocab.decode(&expression)
                    );
                    auto tree_prob = predictions[0].nodes[didx].log_probability.exp().w(0, 0);
                    vgrid->add_in_column(0, make_shared<visualizable::Probability<double>>(tree_prob));
                    vgrid->add_in_column(0, visualization);
                }
                vgrid->add_in_column(1, make_shared<visualizable::Sentence<double>>(expression_string));
                vgrid->add_in_column(1, make_shared<visualizable::FiniteDistribution<double>>(
                    prediction_probability,
                    prediction_string
                ));

                if (visualizer)
                    visualizer->feed(vgrid->to_json());
            });

            double current_accuracy = -1;
            throttled_validation.maybe_run(seconds(30), [&]() {
                current_accuracy = arithmetic::average_recall(validate, pred_fun, FLAGS_j);
                std::cout << "epoch: " << epoch << ", accuracy = " << std::setprecision(3)
                          << 100.0 * current_accuracy << "%" << std::endl;
            });
            if (current_accuracy != -1 && current_accuracy > 0.9) {
                std::cout << "Current accuracy is now " << current_accuracy << std::endl;
                target_accuracy_reached = true;
                break;
            }
            // </reporting>
        }
        solver->step(params); // One step of gradient descent
        epoch++;
    }
}
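// Invocation sketch (hypothetical — solver construction, the predict helper
// and the example containers depend on code outside this section):
//
//     auto solver = std::make_shared<Solver::Adam<REAL_t>>(model.parameters());
//     training_loop(solver, model,
//                   [&](vector<uint>& expr) { return model.predict_best(expr); },
//                   train_examples, validate_examples);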
typename LSTM<R>::activation_t LSTM<R>::activate(
        const vector<Mat<R>>& inputs,
        const vector<activation_t>& states) const {
    Mat<R> input_gate, output_gate;
    vector<Mat<R>> forget_gates;

    for (auto& state : states) {
        assert2(state.memory.dims(1) == hidden_size,
                utils::MS() << "LSTM: State memory should have hidden size " << hidden_size
                            << " not " << state.memory.dims(1));
        assert2(state.hidden.dims(1) == hidden_size,
                utils::MS() << "LSTM: State hidden should have hidden size " << hidden_size
                            << " not " << state.hidden.dims(1));
    }
    assert2(input_sizes.size() == inputs.size(),
            utils::MS() << "LSTM: Got " << inputs.size() << " inputs but expected "
                        << input_sizes.size() << " instead.");
    for (size_t iidx = 0; iidx < input_sizes.size(); ++iidx) {
        assert2(inputs[iidx].dims(1) == input_sizes[iidx],
                utils::MS() << "LSTM: " << iidx << "-th input to LSTM should have size "
                            << input_sizes[iidx] << " not " << inputs[iidx].dims(1));
    }

    auto gate_input = utils::concatenate({inputs, activation_t::hiddens(states)});

    if (memory_feeds_gates) {
        input_gate = input_layer.activate(gate_input);
        // if the memory feeds the gates (Alex Graves 2013) then
        // diagonal matrices (Wci and Wcf) connect memory to the input
        // and forget gates
        for (int cidx = 0; cidx < num_children; ++cidx) {
            auto constant_memory = MatOps<R>::consider_constant_if(states[cidx].memory,
                                                                   !backprop_through_gates);
            input_gate = input_gate + constant_memory * Wcells_to_inputs[cidx];
            forget_gates.emplace_back((
                forget_layers[cidx].activate(gate_input)
                + constant_memory * Wcells_to_forgets[cidx]
            ).sigmoid());
        }
        input_gate = input_gate.sigmoid();
    } else {
        // (Zaremba 2014 style)
        // input gate:
        input_gate = input_layer.activate(gate_input).sigmoid();
        // forget gate
        for (int cidx = 0; cidx < num_children; ++cidx) {
            forget_gates.emplace_back(forget_layers[cidx].activate(gate_input).sigmoid());
        }
    }

    // write operation on cells
    auto cell_write = cell_layer.activate(gate_input).tanh();

    // compute new cell activation
    vector<Mat<R>> memory_contributions;
    for (int cidx = 0; cidx < num_children; ++cidx) {
        memory_contributions.emplace_back(forget_gates[cidx] * states[cidx].memory);
    }
    auto retain_cell = MatOps<R>::add(memory_contributions);
    auto write_cell  = input_gate * cell_write; // what do we write to cell
    auto cell_d      = retain_cell + write_cell; // new cell contents

    if (memory_feeds_gates) {
        // output gate uses new memory (cell_d) to control its gate
        output_gate = (
            output_layer.activate(gate_input)
            + (MatOps<R>::consider_constant_if(cell_d, !backprop_through_gates) * Wco)
        ).sigmoid();
    } else {
        // output gate
        output_gate = output_layer.activate(gate_input).sigmoid();
    }

    // compute hidden state as gated, saturated cell activations
    auto hidden_d = output_gate * cell_d.tanh();
    return activation_t(cell_d, hidden_d);
}
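// For reference, the memory_feeds_gates branch above follows the peephole
// LSTM of Graves (2013), written here for a single child (num_children == 1):
//
//     i_t = sigmoid(W_i [x_t, h_{t-1}] + c_{t-1} * Wci + b_i)
//     f_t = sigmoid(W_f [x_t, h_{t-1}] + c_{t-1} * Wcf + b_f)
//     c_t = f_t * c_{t-1} + i_t * tanh(W_c [x_t, h_{t-1}] + b_c)
//     o_t = sigmoid(W_o [x_t, h_{t-1}] + c_t * Wco + b_o)
//     h_t = o_t * tanh(c_t)
//
// where * denotes the elementwise (diagonal-peephole) product. The else
// branch (Zaremba 2014 style) is the same computation with the c-terms
// dropped from all three gates.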