Example #1
vector<BeamTreeResult<T>> best_trees(vector<Mat<T>> input, int beam_width) const {
    auto leaves = convert_to_leaves(input);
    vector<PartialTree> candidates = { PartialTree(leaves) };
    // Keep merging adjacent nodes until every candidate is a single tree.
    while (candidates[0].nodes.size() > 1) {
        vector<PartialTree> new_candidates;
        for (auto& candidate : candidates) {
            for (auto& new_candidate : cangen(candidate, beam_width)) {
                new_candidates.emplace_back(new_candidate);
            }
        }
        // Rank the expanded pool by log-probability, best first.
        sort(new_candidates.begin(), new_candidates.end(),
             [this](const PartialTree& c1, const PartialTree& c2) {
                 return candidate_log_probability(c1) > candidate_log_probability(c2);
             });
        // Prune back down to at most beam_width candidates.
        candidates = vector<PartialTree>(
                         new_candidates.begin(),
                         new_candidates.begin() + min((size_t)beam_width, new_candidates.size())
                     );
        for (size_t cidx = 0; cidx + 1 < candidates.size(); ++cidx) {
            assert2(candidates[cidx].nodes.size() == candidates[cidx + 1].nodes.size(),
                    "Generated candidates of different sizes.");
        }
    }
    vector<BeamTreeResult<T>> results;
    for (auto& tree : candidates) {
        results.emplace_back(tree.nodes[0], tree.derivation);
    }
    return results;
}
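
The prune step above (sort best-first, then truncate to `beam_width`) is the heart of the beam search. A minimal standalone sketch of that idiom, using plain doubles in place of PartialTree candidates (the scores are made up for illustration):

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main() {
    std::vector<double> scores = {0.1, 0.7, 0.3, 0.9, 0.5};
    size_t beam_width = 3;
    // Sort best-first, then keep at most beam_width survivors.
    std::sort(scores.begin(), scores.end(), std::greater<double>());
    scores.resize(std::min(beam_width, scores.size()));
    for (double s : scores) std::cout << s << ' ';  // prints: 0.9 0.7 0.5
    std::cout << std::endl;
    return 0;
}
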
Example #2
shared_ptr<visualizable::Tree> visualize_derivation(vector<uint> derivation, vector<string> words) {
    using visualizable::Tree;

    // Start with one leaf per word.
    vector<shared_ptr<Tree>> result;
    std::transform(words.begin(), words.end(), std::back_inserter(result),
                   [](const string& a) {
                       return make_shared<Tree>(a);
                   });
    // Replay the derivation: each index merges two adjacent subtrees.
    for (auto merge_idx : derivation) {
        vector<shared_ptr<Tree>> new_result;
        for (size_t ridx = 0; ridx < merge_idx; ++ridx) {
            new_result.push_back(result[ridx]);
        }
        new_result.push_back(make_shared<Tree>(std::initializer_list<shared_ptr<Tree>> {
            result[merge_idx],
            result[merge_idx + 1]
        }));
        for (size_t ridx = merge_idx + 2; ridx < result.size(); ++ridx) {
            new_result.push_back(result[ridx]);
        }
        result = new_result;
    }
    assert2(result.size() == 1, "Derivation did not reduce to a single tree.");

    return result[0];
}
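
Each entry of `derivation` is the position at which two adjacent subtrees were merged, so the loop above replays the parse bottom-up. The same replay on plain strings, as a standalone sketch with made-up input:

#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> items = {"1", "+", "2"};
    std::vector<unsigned> derivation = {1, 0};  // merge ("+", "2"), then ("1", "(+ 2)")
    for (unsigned merge_idx : derivation) {
        // Collapse items[merge_idx] and items[merge_idx + 1] into one node.
        items[merge_idx] = "(" + items[merge_idx] + " " + items[merge_idx + 1] + ")";
        items.erase(items.begin() + merge_idx + 1);
    }
    std::cout << items[0] << std::endl;  // prints: (1 (+ 2))
    return 0;
}
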
Example #3
Conf& Conf::def_choice(std::string name,
                       std::vector<std::string> choices,
                       std::string default_value) {
    assert2(in_vector(choices, default_value),
        MS() << default_value << " is not an option for " << name);
    assert2(choices.size() >= 2,
        MS() << "At least two choices are needed for " << name);
    auto c = make_shared<Choice>();
    c->choices = choices;
    c->default_value = default_value;
    c->value = default_value;

    items[name] = c;
    return *this;
}
Example #4
    /**
    Given an ordered set of n nodes, find the best contiguous
    pairs to join to form n-1 nodes. Return the `beam_width`
    best sets of nodes with the resulting join applied.

    Inputs
    ------

    vector<Node> states : nodes to join
    int      beam_width : number of joins to consider

    Outputs
    -------

    vector<PartialTree> new states : new candidates, each with one join applied
    **/
    vector<PartialTree> cangen(PartialTree candidate, int beam_width) const {
        assert2(candidate.nodes.size() >= 2,
                "Must at least have 2 states to join for candidate generation.");
        size_t num_candidates = min((size_t)beam_width, candidate.nodes.size() - 1);

        vector<Node> possible_joins;
        vector<Mat<T>> scores;
        for (size_t sidx = 0; sidx + 1 < candidate.nodes.size(); ++sidx) {
            possible_joins.emplace_back(
                Mat<T>(),
                join_states(candidate.nodes[sidx], candidate.nodes[sidx + 1])
            );
            scores.emplace_back(prob_decoder.activate(possible_joins.back().state.hidden));
        }
        auto normalized_scores = MatOps<T>::softmax(scores);
        for (size_t sidx = 0; sidx + 1 < candidate.nodes.size(); ++sidx) {
            possible_joins[sidx].log_probability =
                normalized_scores[sidx].log() +
                candidate.nodes[sidx].log_probability +
                candidate.nodes[sidx + 1].log_probability;
        }

        // initialize original index locations
        vector<size_t> idx(possible_joins.size());
        for (size_t i = 0; i < idx.size(); ++i)
            idx[i] = i;

        // sort indices by descending join log-probability
        sort(idx.begin(), idx.end(), [&possible_joins](size_t i1, size_t i2) {
            return possible_joins[i1].log_probability.w(0) > possible_joins[i2].log_probability.w(0);
        });
        vector<PartialTree> results;

        for (size_t cidx = 0; cidx < num_candidates; ++cidx) {
            vector<Node> result;
            size_t join_idx = idx[cidx];
            for (size_t sidx = 0; sidx < join_idx; ++sidx)
                result.emplace_back(candidate.nodes[sidx]);
            result.emplace_back(possible_joins[join_idx]);
            for (size_t sidx = join_idx + 2; sidx < candidate.nodes.size(); ++sidx) {
                result.emplace_back(candidate.nodes[sidx]);
            }
            assert(result.size() == candidate.nodes.size() - 1);
            auto new_derivation = candidate.derivation; // copy
            // join_idx records the decision to join nodes join_idx and join_idx + 1,
            // which is the position visualize_derivation replays later.
            new_derivation.push_back(join_idx);
            results.emplace_back(PartialTree(result, new_derivation));
        }

        return results;
    }
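
The index-sort in cangen is the classic argsort idiom: order positions by the values they point to while leaving the value vector untouched. The same idiom in isolation, with std::iota replacing the manual initialization loop (the numbers are made up):

#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<double> log_probs = {0.2, 0.9, 0.4};
    std::vector<size_t> idx(log_probs.size());
    std::iota(idx.begin(), idx.end(), 0);  // idx = {0, 1, 2}
    std::sort(idx.begin(), idx.end(), [&log_probs](size_t a, size_t b) {
        return log_probs[a] > log_probs[b];  // descending by score
    });
    for (size_t i : idx) std::cout << i << ' ';  // prints: 1 2 0
    std::cout << std::endl;
    return 0;
}
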
Example #5
Conf& Conf::def_int(std::string name,
                    int lower_bound,
                    int upper_bound,
                    int default_value) {
    assert2(lower_bound <= default_value && default_value <= upper_bound,
            MS() << "Default value for " << name << " not in range.");
    auto i = make_shared<Int>();
    i->lower_bound = lower_bound;
    i->upper_bound = upper_bound;
    i->default_value = default_value;
    i->value = default_value;

    items[name] = i;
    return *this;
}
Example #6
Conf& Conf::def_float(std::string name,
                      double lower_bound,
                      double upper_bound,
                      double default_value) {
    assert2(lower_bound <= default_value && default_value <= upper_bound,
            MS() << "Default value for " << name << " not in range.");
    auto f = make_shared<Float>();
    f->lower_bound = lower_bound;
    f->upper_bound = upper_bound;
    f->default_value = default_value;
    f->value = default_value;

    items[name] = f;
    return *this;
}
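
Since every def_* method returns *this, option definitions chain fluently. A usage sketch assuming the Conf class from the examples above is in scope; the option names and ranges here are invented for illustration:

Conf conf;
conf.def_choice("activation", {"relu", "tanh"}, "tanh")
    .def_int("hidden_size", 1, 4096, 256)
    .def_float("learning_rate", 1e-6, 1.0, 0.01);
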
Example #7
void training_loop(std::shared_ptr<Solver::AbstractSolver<REAL_t>> solver,
                   model_t& model,
                   std::function<vector<uint>(vector<uint>&)> pred_fun,
                   vector<numeric_example_t>& train,
                   vector<numeric_example_t>& validate) {
    auto& vocab = arithmetic::vocabulary;

    auto params = model.parameters();

    int epoch = 0;
    int difficulty_waiting = 0;
    auto end_symbol_idx = vocab.word2index[utils::end_symbol];

    int beam_width = FLAGS_beam_width;

    if (beam_width < 1)
        utils::exit_with_message(MS() << "Beam width must be strictly positive (got " << beam_width << ")");

    Throttled throttled_examples;
    Throttled throttled_validation;

    bool target_accuracy_reached = false;

    while (!target_accuracy_reached && epoch++ < FLAGS_graduation_time) {

        auto indices = utils::random_arange(train.size());

        REAL_t minibatch_error = 0.0;

        // one minibatch
        for (auto indices_begin = indices.begin();
                indices_begin < indices.begin() + std::min((size_t)FLAGS_minibatch, train.size());
                indices_begin++) {
            // <training>
            auto& example = train[*indices_begin];

            auto error = model.error(example, beam_width);
            error.grad();
            graph::backward();
            minibatch_error += error.w(0);
            // </training>
            // <reporting>
            throttled_examples.maybe_run(seconds(10), [&]() {
                graph::NoBackprop nb;
                auto random_example_index = utils::randint(0, validate.size() - 1);
                auto& expression = validate[random_example_index].first;
                auto predictions = model.predict(expression,
                                                 beam_width,
                                                 MAX_OUTPUT_LENGTH,
                                                 vocab.word2index.at(utils::end_symbol));

                auto expression_string = arithmetic::vocabulary.decode(&expression);
                if (expression_string.back() == utils::end_symbol)
                    expression_string.resize(expression_string.size() - 1);
                std::cout << utils::join(expression_string) << std::endl;

                vector<string> prediction_string;
                vector<double> prediction_probability;

                for (auto& prediction : predictions) {
                    if (validate[random_example_index].second == prediction.prediction) {
                        std::cout << utils::green;
                    }
                    prediction_probability.push_back(prediction.get_probability().w(0));
                    std::cout << "= (" << std::setprecision( 3 ) << prediction.get_probability().log().w(0) << ") ";
                    auto digits = vocab.decode(&prediction.prediction);
                    if (digits.back() == utils::end_symbol)
                        digits.pop_back();
                    auto joined_digits = utils::join(digits);
                    prediction_string.push_back(joined_digits);
                    std::cout << joined_digits << utils::reset_color << std::endl;
                }
                auto vgrid = make_shared<visualizable::GridLayout>();

                assert2(predictions[0].derivations.size() == predictions[0].nodes.size(),
                        "Derivations and nodes should have matching lengths.");
                for (size_t didx = 0;
                        didx < min((size_t)FLAGS_visualizer_trees, predictions[0].derivations.size());
                        ++didx) {
                    auto visualization = visualize_derivation(
                                             predictions[0].derivations[didx],
                                             vocab.decode(&expression)
                                         );
                    auto tree_prob = predictions[0].nodes[didx].log_probability.exp().w(0,0);
                    vgrid->add_in_column(0, make_shared<visualizable::Probability<double>>(tree_prob));
                    vgrid->add_in_column(0, visualization);
                }
                vgrid->add_in_column(1, make_shared<visualizable::Sentence<double>>(expression_string));
                vgrid->add_in_column(1, make_shared<visualizable::FiniteDistribution<double>>(
                                         prediction_probability,
                                         prediction_string
                                     ));

                if (visualizer)
                    visualizer->feed(vgrid->to_json());

            });
            double current_accuracy = -1;
            throttled_validation.maybe_run(seconds(30), [&]() {
                current_accuracy = arithmetic::average_recall(validate, pred_fun, FLAGS_j);
                std::cout << "epoch: " << epoch << ", accuracy = " << std::setprecision( 3 )
                          << 100.0 * current_accuracy << "%" << std::endl;
            });
            if (current_accuracy != -1 && current_accuracy > 0.9) {
                std::cout << "Current accuracy is now " << current_accuracy << std::endl;
                target_accuracy_reached = true;
                break;
            }
            // </reporting>
        }
        solver->step(params); // one step of gradient descent; epoch is already
                              // incremented in the while condition above
    }
}
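
training_loop leans on utils::Throttled to rate-limit its console and visualizer reporting. A minimal sketch of a helper with the same maybe_run(interval, callback) shape, inferred only from the call pattern above and not Dali's actual implementation:

#include <chrono>

struct ThrottledSketch {
    // Default-constructed time_point is the clock's epoch, so the first
    // maybe_run always fires.
    std::chrono::steady_clock::time_point last_run{};

    // Run `callback` only if at least `interval` has elapsed since the last run.
    template<typename Callback>
    void maybe_run(std::chrono::steady_clock::duration interval, Callback callback) {
        auto now = std::chrono::steady_clock::now();
        if (now - last_run >= interval) {
            last_run = now;
            callback();
        }
    }
};
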
Example #8
File: LSTM.cpp Project: bhack/Dali
typename LSTM<R>::activation_t LSTM<R>::activate(
        const vector<Mat<R>>& inputs,
        const vector<activation_t>& states) const {
    Mat<R> input_gate, output_gate;
    vector<Mat<R>> forget_gates;

    for (auto& state: states) {
        assert2(state.memory.dims(1) == hidden_size,
            utils::MS() << "LSTM: State memory should have hidden size "
                        << hidden_size << " not " << state.memory.dims(1));
        assert2(state.hidden.dims(1) == hidden_size,
            utils::MS() << "LSTM: State hidden should have hidden size "
                        << hidden_size << " not " << state.hidden.dims(1));
    }
    assert2(input_sizes.size() == inputs.size(),
        utils::MS() << "LSTM: Got " << inputs.size() << " inputs but expected " << input_sizes.size() << " instead."
    );
    for (size_t iidx = 0; iidx < input_sizes.size(); ++iidx) {
        assert2(inputs[iidx].dims(1) == input_sizes[iidx],
                utils::MS() << "LSTM: " << iidx << "-th input to LSTM should have size "
                            << input_sizes[iidx] << " not " << inputs[iidx].dims(1));
    }
    auto gate_input = utils::concatenate({inputs, activation_t::hiddens(states)});

    if (memory_feeds_gates) {
        input_gate  = input_layer.activate(gate_input);
        // if the memory feeds the gates (Alex Graves 2013) then
        // diagonal matrices (Wci and Wcf) connect the memory to
        // the input and forget gates
        for (int cidx = 0; cidx < num_children; ++cidx) {
            auto constant_memory = MatOps<R>::consider_constant_if(states[cidx].memory, !backprop_through_gates);
            input_gate           = input_gate + constant_memory * Wcells_to_inputs[cidx];
            forget_gates.emplace_back(
                (
                    forget_layers[cidx].activate(gate_input) + constant_memory * Wcells_to_forgets[cidx]
                ).sigmoid()
            );
        }
        input_gate  = input_gate.sigmoid();
    } else {
        // (Zaremba 2014 style)

        // input gate:
        input_gate  = input_layer.activate(gate_input).sigmoid();
        // forget gate
        for (int cidx = 0; cidx < num_children; ++cidx) {
            forget_gates.emplace_back(forget_layers[cidx].activate(gate_input).sigmoid());
        }
    }

    // write operation on cells
    auto cell_write  = cell_layer.activate(gate_input).tanh();

    // compute new cell activation
    vector<Mat<R>> memory_contributions;
    for (int cidx = 0; cidx < num_children; ++cidx) {
        memory_contributions.emplace_back(forget_gates[cidx] * states[cidx].memory);
    }

    auto retain_cell = MatOps<R>::add(memory_contributions);
    auto write_cell  = input_gate  * cell_write; // what do we write to cell
    auto cell_d      = retain_cell + write_cell; // new cell contents

    if (memory_feeds_gates) {
        // output gate uses new memory (cell_d) to control its gate
        output_gate = (
            output_layer.activate(gate_input) + (MatOps<R>::consider_constant_if(cell_d, !backprop_through_gates) * Wco)
        ).sigmoid();
    } else {
        // output gate
        output_gate = output_layer.activate(gate_input).sigmoid();
    }

    // compute hidden state as gated, saturated cell activations
    auto hidden_d = output_gate * cell_d.tanh();

    return activation_t(cell_d, hidden_d);
}
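
For reference, the computation above in equation form, with \sigma the logistic sigmoid, \odot the elementwise product, [x; h] the concatenated gate input, and the W_{ci}/W_{cf}/W_{co} peephole terms present only when memory_feeds_gates is set:

    i        = \sigma(W_i [x; h] + \sum_k W_{ci,k} \odot c_k)
    f_k      = \sigma(W_{f,k} [x; h] + W_{cf,k} \odot c_k)
    \tilde c = \tanh(W_c [x; h])
    c'       = \sum_k f_k \odot c_k + i \odot \tilde c
    o        = \sigma(W_o [x; h] + W_{co} \odot c')
    h'       = o \odot \tanh(c')
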