int main(int argc, char* argv[]) {
    GFLAGS_NAMESPACE::SetUsageMessage(
        "\n"
        "Sentiment Analysis as Competition amongst Language Models\n"
        "---------------------------------------------------------\n"
        "\n"
        "We present a dual formulation of the word sequence classification\n"
        "task: we treat each label’s examples as originating from different\n"
        "languages and we train language models for each label; at test\n"
        "time we compare the likelihood of a sequence under each label’s\n"
        "language model to find the most likely assignment.\n"
        "\n"
        " @author Jonathan Raiman\n"
        " @date February 13th 2015"
    );
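
    // The decision rule implied by the usage message, as a sketch:
    //
    //     label* = argmax_k  log P(w_1, ..., w_T | model_k)
    //
    // i.e. score a sequence under each label's language model and pick the
    // label whose model finds it most likely (or least surprising, when
    // --use_surprise is set).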


    GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);

    auto epochs              = FLAGS_epochs;
    auto sentiment_treebank  = SST::load(FLAGS_train);

    auto word_vocab          = SST::get_vocabulary(sentiment_treebank, FLAGS_min_occurence);
    auto vocab_size          = word_vocab.size();

    // Load Dataset of Trees:
    std::cout << "Unique Treees Loaded : " << sentiment_treebank.size() << std::endl
              << "        Example tree : " << *sentiment_treebank[sentiment_treebank.size()-1] << std::endl
              << "     Vocabulary size : " << vocab_size << std::endl;

    // Put trees into matrices:
    const int NUM_SENTIMENTS = 5;
    vector<vector<Databatch>> datasets(NUM_SENTIMENTS);
    vector<vector<Databatch>> validation_sets(NUM_SENTIMENTS);

    {
        vector<tokenized_uint_labeled_dataset> tree_types(NUM_SENTIMENTS);
        vector<tokenized_uint_labeled_dataset> validation_tree_types(NUM_SENTIMENTS);

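        // every labeled subtree (not just the sentence root) becomes a
        // training example for the language model of its own label: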
        for (auto& tree : sentiment_treebank) {
            if (((int) tree->label) > 4)
                utils::exit_with_message("Error: One of the trees has a label other than 0-4");
            tree_types[tree->label].emplace_back(tree->to_labeled_pair());
            for (auto& child : tree->general_children) {
                if (((int)child->label) > 4)
                    utils::exit_with_message("Error: One of the trees's children has a label other than 0-4");
                tree_types[(int) child->label].emplace_back(child->to_labeled_pair());
            }
        }
        auto validation_treebank = SST::load(FLAGS_validation);
        for (auto& tree : validation_treebank) {
            if (((int) tree->label) > 4)
                utils::exit_with_message("Error: One of the trees has a label other than 0-4");
            validation_tree_types[tree->label].emplace_back(tree->to_labeled_pair());
            for (auto& child : tree->general_children) {
                if (((int)child->label) > 4)
                    utils::exit_with_message("Error: One of the trees's children has a label other than 0-4");
                validation_tree_types[(int) child->label].emplace_back(child->to_labeled_pair());
            }
        }
        int i = 0;
        for (auto& tree_type : tree_types)
            std::cout << "Label type " << i++ << " has " << tree_type.size() << " different examples" << std::endl;
        i = 0;

        for (auto& tree_type : validation_tree_types) {
            std::cout << "Label type " << i++ << " has " << tree_type.size() << " validation examples" << std::endl;
        }
        i = 0;
        for (auto& tree_type : tree_types) {
            datasets[i++] = Databatch::create_dataset(tree_type, word_vocab, FLAGS_minibatch, true);
        }

        i = 0;
        for (auto& tree_type : validation_tree_types)
            validation_sets[i++] = Databatch::create_dataset(tree_type, word_vocab, FLAGS_minibatch, true);
    }

    std::cout     << "    Max training epochs = " << FLAGS_epochs << std::endl;
    std::cout     << "    Training cutoff     = " << FLAGS_cutoff << std::endl;
    std::cout     << "Minibatches/label/x-val = " << FLAGS_epoch_batches << std::endl;
    #ifdef USE_GATES
        std::cout << "      using gated model = true" << std::endl;
    #else
        std::cout << "      using gated model = false" << std::endl;
    #endif
    std::cout     << "     Use Shortcut LSTMs = " << (FLAGS_shortcut ? "true" : "false") << std::endl;
    std::cout     << " Comparing models using = " << (FLAGS_use_surprise ? "surprise" : "log likelihood") << std::endl;

    pool = new ThreadPool(FLAGS_j);

    int patience = 0;
    // With a ramp-up schedule we start with zero memory penalty and gradually
    // increase the memory L1 penalty until it reaches the desired level: this
    // allows early exploration, but later forces sparsity on the model.
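    //
    // Concretely (see the training loop below; only active when compiled
    // with USE_GATES):
    //
    //     penalty(epoch) = (FLAGS_memory_penalty / batch_width)
    //                    * min(1, epoch^2 / FLAGS_memory_rampup^2)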

    std::vector<MODEL_USED<REAL_t>> models;
    vector<vector<MODEL_USED<REAL_t>>> thread_models;
    vector<Solver::Adam<REAL_t>> solvers;


    for (int sentiment = 0; sentiment < NUM_SENTIMENTS; sentiment++) {

        if (!FLAGS_load.empty()) {
            std::cout << "Loading model : \"" << FLAGS_load << sentiment << "\"" << std::endl;
            models.emplace_back(MODEL_USED<REAL_t>::load(FLAGS_load + std::to_string(sentiment)));
        } else {
            models.emplace_back(
                word_vocab.size(),
                FLAGS_input_size,
                FLAGS_hidden,
                FLAGS_stack_size < 1 ? 1 : FLAGS_stack_size,
                word_vocab.size(),
                FLAGS_shortcut,
                FLAGS_memory_feeds_gates
            );
        }
        thread_models.emplace_back();
        for (int thread_no = 0; thread_no < FLAGS_j; ++thread_no) {
            thread_models[sentiment].push_back(models[sentiment].shallow_copy());
        }
        auto params = models[sentiment].parameters();
        //solvers.emplace_back(params, FLAGS_rho);
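        // hand-tuned Adam hyperparameters, used here in place of the
        // FLAGS_rho construction above: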
        solvers.emplace_back(params, 0.1, 0.001, 1e-9, 5.0);
    }
    int epoch = 0;
    REAL_t accuracy = 0.0;
    REAL_t new_accuracy;
    Throttled t;

    while (accuracy < FLAGS_cutoff && patience < FLAGS_patience) {
        stringstream ss;
        ss << "Epoch " << ++epoch;
        atomic<int> batches_processed(0);

        ReportProgress<double> journalist(ss.str(), NUM_SENTIMENTS * FLAGS_epoch_batches);

        for (int sentiment = 0; sentiment < NUM_SENTIMENTS; sentiment++) {
            for (int batch_id = 0; batch_id < FLAGS_epoch_batches; ++batch_id) {
                pool->run([&thread_models, &journalist, &solvers, &datasets, sentiment, &epoch, &accuracy, &batches_processed]() {
                    auto& thread_model = thread_models[sentiment][ThreadPool::get_thread_number()];
                    auto& solver = solvers[sentiment];

                    auto thread_parameters = thread_model.parameters();
                    auto& minibatch = datasets[sentiment][utils::randint(0, datasets[sentiment].size()-1)];

                    #ifdef USE_GATES
                        thread_model.memory_penalty =
                            (FLAGS_memory_penalty / minibatch.data->cols()) *
                            std::min(
                                (REAL_t) 1.0,
                                ((REAL_t) (epoch * epoch)) /
                                ((REAL_t) FLAGS_memory_rampup * FLAGS_memory_rampup)
                            );
                    #endif

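                    // the minibatch is both input and target: as in the
                    // language model example below, the model predicts each
                    // word one step ahead of the words it has seen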
                    thread_model.masked_predict_cost(
                        minibatch.data,
                        minibatch.data,
                        minibatch.mask,
                        FLAGS_dropout,
                        1,
                        0);
                    graph::backward(); // backpropagate
                    solver.step(thread_parameters); // One step of gradient descent

                    journalist.tick(++batches_processed, accuracy);
                });
            }
        }

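        // while the worker threads drain the queue, periodically pause the
        // progress bar to print beam-search reconstructions from the models: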
        while (true) {
            journalist.pause();
            reconstruct_random_beams(models, datasets, word_vocab,
                utils::randint(2, 6), // how many elements to use as a primer for beam
                FLAGS_num_reconstructions, // how many beams
                20 // max size of a sequence
            );
            journalist.resume();
            // TODO(jonathan): reconstructions go here..
            if (pool->wait_until_idle(seconds(20)))
                break;
        }

        journalist.done();
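        // the validation score is treated as an accuracy (higher is better):
        // training runs until it reaches FLAGS_cutoff, and patience grows
        // whenever it declines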
        new_accuracy = average_error(models, validation_sets);

        if (new_accuracy < accuracy) {
            patience += 1;
        } else {
            patience = 0;
        }
        accuracy = new_accuracy;

        t.maybe_run(seconds(600), [&models]() {
            int i = 0;
            for (auto& model : models) {
                model.save(FLAGS_save + std::to_string(i));
                i++;
            }
        });
    }

    return 0;
}
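
// ----------------------------------------------------------------------------
// Second example program (kept as a separate source file, since two
// definitions of main cannot share one translation unit): an RNN language
// model built from stacked LSTMs.
// ----------------------------------------------------------------------------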
int main(int argc, char* argv[]) {
    GFLAGS_NAMESPACE::SetUsageMessage(
        "\n"
        "RNN Language Model using Stacked LSTMs\n"
        "--------------------------------------\n"
        "\n"
        "Predict next word in sentence using Stacked LSTMs.\n"
        "\n"
        " @author Jonathan Raiman\n"
        " @date February 15th 2015"
    );

    GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
    utils::update_device(FLAGS_device);

    utils::Vocab      word_vocab;
    vector<LanguageBatch<REAL_t>> training;
    vector<LanguageBatch<REAL_t>> validation;

    Timer dl_timer("Dataset loading");
    std::tie(word_vocab, training) = load_dataset_and_vocabulary<REAL_t>(
        FLAGS_train,
        FLAGS_min_occurence,
        FLAGS_minibatch);

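    // encode the validation set with the training vocabulary so that word
    // indices agree between the two splits: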
    validation = load_dataset_with_vocabulary<REAL_t>(
        FLAGS_validation,
        word_vocab,
        FLAGS_minibatch);
    dl_timer.stop();

    std::cout << "    Vocabulary size = " << word_vocab.size() << " (occuring more than " << FLAGS_min_occurence << ")" << std::endl
              << "Max training epochs = " << FLAGS_epochs           << std::endl
              << "    Training cutoff = " << FLAGS_cutoff           << std::endl
              << "  Number of threads = " << FLAGS_j                << std::endl
              << "     minibatch size = " << FLAGS_minibatch        << std::endl
              << "       max_patience = " << FLAGS_patience         << std::endl
              << "             device = " << (FLAGS_device == -1 ? "cpu" : "gpu") << std::endl;
    pool = new ThreadPool(FLAGS_j);
    shared_ptr<Visualizer> visualizer;

    if (!FLAGS_visualizer.empty()) {
        try {
            visualizer = make_shared<Visualizer>(FLAGS_visualizer, FLAGS_visualizer_hostname, FLAGS_visualizer_port);
        } catch (const std::runtime_error& e) {
            std::cout << e.what() << std::endl; // could not connect to redis.
        }
    }

    auto model = stacked_model_from_CLI<REAL_t>(
        FLAGS_load,
        word_vocab.size(),
        word_vocab.size(),
        true);

    auto parameters = model.parameters();
    auto solver     = Solver::construct(FLAGS_solver, parameters, (REAL_t) FLAGS_learning_rate);

    // replicate model for each thread:
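    // (assumption: the boolean constructor flags control how much state each
    // copy shares with the master model, so that one solver can apply every
    // worker's gradients to the same parameters)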
    vector<StackedModel<REAL_t>> thread_models;
    if (FLAGS_j == 1) {
        thread_models.emplace_back(model, false, false);
    } else {
        for (int i = 0; i < FLAGS_j; ++i)
            thread_models.emplace_back(model, false, true);
    }

    Throttled throttled;
    Throttled throttled_wps;

    int epoch       = 0;
    auto cost       = std::numeric_limits<REAL_t>::infinity();
    double new_cost = 0.0;
    int patience    = 0;

    double average_words_per_second = 0;
    int word_done_in_past_second = 0;

    utils::ThreadAverage avg_error(FLAGS_j);

    while (cost > FLAGS_cutoff && epoch < FLAGS_epochs && patience < FLAGS_patience) {
        std::atomic<int> full_code_size(0);
        auto random_batch_order = utils::random_arange(training.size());

        std::atomic<int> batches_processed(0);

        ReportProgress<double> journalist(utils::MS() << "Training epoch " << epoch, random_batch_order.size());

        for (auto batch_id : random_batch_order) {
            pool->run([&, solver, batch_id]() {
                auto& thread_model = thread_models[ThreadPool::get_thread_number()];
                auto thread_parameters = thread_model.parameters();
                auto& minibatch = training[batch_id];

                auto error = thread_model.masked_predict_cost(
                    minibatch, FLAGS_dropout,
                    1 // sequence forecasting problem - predict target one step ahead
                );
                error.grad();

                graph::backward(); // backpropagate
                solver->step(thread_parameters);

                // word_done_in_past_second += minibatch.total_codes;
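                // each column of the batch contributes (timesteps - 1)
                // predictions: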
                word_done_in_past_second += (minibatch.data.dims(0)-1) * (minibatch.data.dims(1));
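                // refresh an exponentially smoothed words/sec estimate
                // (0.5 old + 0.5 new) at most once per second: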
                throttled_wps.maybe_run(seconds(1), [&]() {
                    average_words_per_second = 0.5 * average_words_per_second + 0.5 * word_done_in_past_second;
                    word_done_in_past_second = 0;
                });

                if (FLAGS_show_wps) {
                    journalist.tick(++batches_processed, average_words_per_second);
                } else {
                    avg_error.update(error.sum().w(0) / minibatch.total_codes);
                    journalist.tick(++batches_processed, avg_error.average());
                }
                if (FLAGS_show_reconstructions) {
                    throttled.maybe_run(seconds(10), [&]() {
                        // Tell the journalist the news can wait
                        journalist.pause();
                        graph::NoBackprop nb;
                        auto& random_batch = training[utils::randint(0, training.size() - 1)];
                        auto random_example_index = utils::randint(0, random_batch.data.dims(1) - 1);
                        std::cout << random_batch.code_lengths[random_example_index] << std::endl;

                        int priming_size = utils::randint(1, std::min(6, random_batch.code_lengths[random_example_index]));

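                        // copy the first priming_size words of the example to
                        // seed the beam search: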
                        vector<uint> priming;
                        for (int i = 0; i < priming_size; ++i) {
                            priming.push_back(random_batch.data.w(i, random_example_index));
                        }

                        auto beams = the_beam_search(model, word_vocab, &priming);

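                        // drop the start-of-sequence token before decoding the
                        // primer for display: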
                        vector<uint> priming_no_start(priming.begin() + 1, priming.end());

                        std::cout << "Reconstructions: " << std::endl;
                        for (auto& beam : beams) {
                            std::cout << "=> (" << std::setprecision( 5 ) << beam.score << ") ";
                            std::cout << utils::join(word_vocab.decode(&priming_no_start), " ") << " ";
                            std::cout << utils::bold;
                            std::cout << utils::join(word_vocab.decode(&beam.solution, true), " ") << std::endl;
                            std::cout << utils::reset_color << std::endl;
                        }

                        if (visualizer != nullptr) {
                            vector<vector<string>> sentences;
                            vector<REAL_t>         probs;
                            for (auto& beam : beams) {
                                sentences.emplace_back(word_vocab.decode(&beam.solution, true));
                                probs.emplace_back(beam.score);
                            }

                            auto input_sentence = make_shared<Sentence<REAL_t>>(
                                    word_vocab.decode(&priming_no_start));
                            auto sentences_viz = make_shared<Sentences<REAL_t>>(sentences);
                            sentences_viz->set_weights(probs);

                            auto input_output_pair = GridLayout();

                            input_output_pair.add_in_column(0, input_sentence);
                            input_output_pair.add_in_column(1, sentences_viz);

                            visualizer->feed(input_output_pair.to_json());
                        }

                        journalist.resume();

                    });
                }
            });
        }

        pool->wait_until_idle();
        journalist.done();

        new_cost = average_error(model, validation);
        if (new_cost >= cost) {
            patience += 1;
        } else {
            patience = 0;
        }
        cost = new_cost;
        std::cout << "epoch (" << epoch << ") KL error = "
                  << std::setprecision(3) << std::fixed
                  << std::setw(5) << std::setfill(' ') << new_cost
                  << " patience = " << patience << std::endl;
        maybe_save_model(&model);

        Timer::report();

        epoch++;
    }

    return 0;
}