int main(int argc, char* argv[]) {
    GFLAGS_NAMESPACE::SetUsageMessage(
        "\n"
        "Sentiment Analysis as Competition amongst Language Models\n"
        "---------------------------------------------------------\n"
        "\n"
        "We present a dual formulation of the word sequence classification\n"
        "task: we treat each label's examples as originating from different\n"
        "languages and we train language models for each label; at test\n"
        "time we compare the likelihood of a sequence under each label's\n"
        "language model to find the most likely assignment.\n"
        "\n"
        " @author Jonathan Raiman\n"
        " @date February 13th 2015"
    );
    GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);

    auto epochs             = FLAGS_epochs;
    auto sentiment_treebank = SST::load(FLAGS_train);
    auto word_vocab         = SST::get_vocabulary(sentiment_treebank, FLAGS_min_occurence);
    auto vocab_size         = word_vocab.size();

    // Load Dataset of Trees:
    std::cout << "Unique Trees Loaded : " << sentiment_treebank.size() << std::endl
              << " Example tree : " << *sentiment_treebank[sentiment_treebank.size()-1] << std::endl
              << " Vocabulary size : " << vocab_size << std::endl;

    // Put trees into matrices:
    const int NUM_SENTIMENTS = 5;
    vector<vector<Databatch>> datasets(NUM_SENTIMENTS);
    vector<vector<Databatch>> validation_sets(NUM_SENTIMENTS);

    {
        vector<tokenized_uint_labeled_dataset> tree_types(NUM_SENTIMENTS);
        vector<tokenized_uint_labeled_dataset> validation_tree_types(NUM_SENTIMENTS);

        // bucket each training tree (and all of its subtrees) by sentiment label:
        for (auto& tree : sentiment_treebank) {
            if (((int) tree->label) > 4)
                utils::exit_with_message("Error: One of the trees has a label other than 0-4");
            tree_types[tree->label].emplace_back(tree->to_labeled_pair());
            for (auto& child : tree->general_children) {
                if (((int) child->label) > 4)
                    utils::exit_with_message("Error: One of the tree's children has a label other than 0-4");
                tree_types[(int) child->label].emplace_back(child->to_labeled_pair());
            }
        }

        // same bucketing for the validation treebank:
        auto validation_treebank = SST::load(FLAGS_validation);
        for (auto& tree : validation_treebank) {
            if (((int) tree->label) > 4)
                utils::exit_with_message("Error: One of the trees has a label other than 0-4");
            validation_tree_types[tree->label].emplace_back(tree->to_labeled_pair());
            for (auto& child : tree->general_children) {
                if (((int) child->label) > 4)
                    utils::exit_with_message("Error: One of the tree's children has a label other than 0-4");
                validation_tree_types[(int) child->label].emplace_back(child->to_labeled_pair());
            }
        }

        int i = 0;
        for (auto& tree_type : tree_types)
            std::cout << "Label type " << i++ << " has " << tree_type.size() << " different examples" << std::endl;
        i = 0;
        for (auto& tree_type : validation_tree_types) {
            std::cout << "Label type " << i++ << " has " << tree_type.size() << " validation examples" << std::endl;
        }
        i = 0;
        for (auto& tree_type : tree_types) {
            datasets[i++] = Databatch::create_dataset(tree_type, word_vocab, FLAGS_minibatch, true);
        }
        i = 0;
        for (auto& tree_type : validation_tree_types)
            validation_sets[i++] = Databatch::create_dataset(tree_type, word_vocab, FLAGS_minibatch, true);
    }

    std::cout << " Max training epochs = " << FLAGS_epochs << std::endl;
    std::cout << " Training cutoff = " << FLAGS_cutoff << std::endl;
    std::cout << "Minibatches/label/x-val = " << FLAGS_epoch_batches << std::endl;
    #ifdef USE_GATES
        std::cout << " using gated model = true" << std::endl;
    #else
        std::cout << " using gated model = false" << std::endl;
    #endif
    std::cout << " Use Shortcut LSTMs = " << (FLAGS_shortcut ? "true" : "false") << std::endl;
    std::cout << " Comparing models using = " << (FLAGS_use_surprise ? "surprise" : "log likelihood") << std::endl;

    pool = new ThreadPool(FLAGS_j);
    int patience = 0;

    // with a rampup model we start with zero memory penalty and gradually increase the memory
    // L1 penalty until it reaches the desired level.
    // this allows early exploration, but only later forces sparsity on the model
    std::vector<MODEL_USED<REAL_t>> models;
    vector<vector<MODEL_USED<REAL_t>>> thread_models;
    vector<Solver::Adam<REAL_t>> solvers;

    // one language model (and one solver) per sentiment label:
    for (int sentiment = 0; sentiment < NUM_SENTIMENTS; sentiment++) {
        if (!FLAGS_load.empty()) {
            std::cout << "Loading model : \"" << FLAGS_load << sentiment << "\"" << std::endl;
            models.emplace_back(MODEL_USED<REAL_t>::load(FLAGS_load + std::to_string(sentiment)));
        } else {
            models.emplace_back(
                word_vocab.size(),
                FLAGS_input_size,
                FLAGS_hidden,
                FLAGS_stack_size < 1 ? 1 : FLAGS_stack_size,
                word_vocab.size(),
                FLAGS_shortcut,
                FLAGS_memory_feeds_gates
            );
        }
        // shallow copies share parameters with the main model, one per worker thread:
        thread_models.emplace_back();
        for (int thread_no = 0; thread_no < FLAGS_j; ++thread_no) {
            thread_models[sentiment].push_back(models[sentiment].shallow_copy());
        }
        auto params = models[sentiment].parameters();
        // solvers.emplace_back(params, FLAGS_rho);
        solvers.emplace_back(params, 0.1, 0.001, 1e-9, 5.0);
    }

    int epoch = 0;
    REAL_t accuracy = 0.0;
    REAL_t new_accuracy;
    Throttled t;

    while (accuracy < FLAGS_cutoff && patience < FLAGS_patience) {
        stringstream ss;
        ss << "Epoch " << ++epoch;
        atomic<int> batches_processed(0);
        ReportProgress<double> journalist(ss.str(), NUM_SENTIMENTS * FLAGS_epoch_batches);

        for (int sentiment = 0; sentiment < NUM_SENTIMENTS; sentiment++) {
            for (int batch_id = 0; batch_id < FLAGS_epoch_batches; ++batch_id) {
                pool->run([&thread_models, &journalist, &solvers, &datasets, sentiment,
                           &epoch, &accuracy, &batches_processed]() {
                    auto& thread_model      = thread_models[sentiment][ThreadPool::get_thread_number()];
                    auto& solver            = solvers[sentiment];
                    auto thread_parameters  = thread_model.parameters();
                    auto& minibatch         = datasets[sentiment][utils::randint(0, datasets[sentiment].size() - 1)];
                    #ifdef USE_GATES
                        // ramp the memory L1 penalty up quadratically with the epoch,
                        // saturating at FLAGS_memory_penalty once epoch >= FLAGS_memory_rampup:
                        thread_model.memory_penalty = (FLAGS_memory_penalty / minibatch.data->cols()) *
                            std::min((REAL_t)1.0,
                                     ((REAL_t)(epoch * epoch) /
                                      ((REAL_t)FLAGS_memory_rampup * FLAGS_memory_rampup)));
                    #endif
                    // language modeling objective: predict the same sequence offset by one step
                    thread_model.masked_predict_cost(
                        minibatch.data,
                        minibatch.data,
                        minibatch.mask,
                        FLAGS_dropout,
                        1,
                        0);
                    graph::backward();              // backpropagate
                    solver.step(thread_parameters); // One step of gradient descent
                    journalist.tick(++batches_processed, accuracy);
                });
            }
        }

        // while the pool finishes this epoch's work, periodically show beam-search reconstructions:
        while (true) {
            journalist.pause();
            reconstruct_random_beams(
                models, datasets, word_vocab,
                utils::randint(2, 6),       // how many elements to use as a primer for beam
                FLAGS_num_reconstructions,  // how many beams
                20                          // max size of a sequence
            );
            journalist.resume();
            // TODO(jonathan): reconstructions go here..
            if (pool->wait_until_idle(seconds(20)))
                break;
        }
        journalist.done();

        new_accuracy = average_error(models, validation_sets);
        if (new_accuracy < accuracy) {
            patience += 1;
        } else {
            patience = 0;
        }
        accuracy = new_accuracy;

        // save the models at most once every 600 seconds:
        t.maybe_run(seconds(600), [&models]() {
            int i = 0;
            for (auto& model : models) {
                model.save(FLAGS_save + std::to_string(i));
                i++;
            }
        });
    }
    return 0;
}
int main(int argc, char* argv[]) {
    GFLAGS_NAMESPACE::SetUsageMessage(
        "\n"
        "RNN Language Model using Stacked LSTMs\n"
        "--------------------------------------\n"
        "\n"
        "Predict next word in sentence using Stacked LSTMs.\n"
        "\n"
        " @author Jonathan Raiman\n"
        " @date February 15th 2015"
    );
    GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);

    utils::update_device(FLAGS_device);
    utils::Vocab word_vocab;
    vector<LanguageBatch<REAL_t>> training;
    vector<LanguageBatch<REAL_t>> validation;

    Timer dl_timer("Dataset loading");
    std::tie(word_vocab, training) = load_dataset_and_vocabulary<REAL_t>(
        FLAGS_train,
        FLAGS_min_occurence,
        FLAGS_minibatch);
    validation = load_dataset_with_vocabulary<REAL_t>(
        FLAGS_validation,
        word_vocab,
        FLAGS_minibatch);
    dl_timer.stop();

    std::cout << " Vocabulary size = " << word_vocab.size()
              << " (occurring more than " << FLAGS_min_occurence << ")" << std::endl
              << "Max training epochs = " << FLAGS_epochs << std::endl
              << " Training cutoff = " << FLAGS_cutoff << std::endl
              << " Number of threads = " << FLAGS_j << std::endl
              << " minibatch size = " << FLAGS_minibatch << std::endl
              << " max_patience = " << FLAGS_patience << std::endl
              << " device = " << (FLAGS_device == -1 ? "cpu" : "gpu") << std::endl;

    pool = new ThreadPool(FLAGS_j);

    shared_ptr<Visualizer> visualizer;
    if (!FLAGS_visualizer.empty()) {
        try {
            visualizer = make_shared<Visualizer>(FLAGS_visualizer,
                                                 FLAGS_visualizer_hostname,
                                                 FLAGS_visualizer_port);
        } catch (const std::runtime_error& e) {
            std::cout << e.what() << std::endl; // could not connect to redis.
        }
    }

    auto model = stacked_model_from_CLI<REAL_t>(
        FLAGS_load,
        word_vocab.size(),
        word_vocab.size(),
        true);
    auto parameters = model.parameters();
    auto solver = Solver::construct(FLAGS_solver, parameters, (REAL_t)FLAGS_learning_rate);

    // replicate model for each thread:
    vector<StackedModel<REAL_t>> thread_models;
    if (FLAGS_j == 1) {
        thread_models.emplace_back(model, false, false);
    } else {
        for (int i = 0; i < FLAGS_j; ++i)
            thread_models.emplace_back(model, false, true);
    }

    Throttled throttled;
    Throttled throttled_wps;

    int epoch                       = 0;
    auto cost                       = std::numeric_limits<REAL_t>::infinity();
    double new_cost                 = 0.0;
    int patience                    = 0;
    double average_words_per_second = 0;
    int word_done_in_past_second    = 0;
    utils::ThreadAverage avg_error(FLAGS_j);

    while (cost > FLAGS_cutoff && epoch < FLAGS_epochs && patience < FLAGS_patience) {
        std::atomic<int> full_code_size(0);
        auto random_batch_order = utils::random_arange(training.size());
        std::atomic<int> batches_processed(0);
        ReportProgress<double> journalist(utils::MS() << "Training epoch " << epoch,
                                          random_batch_order.size());

        for (auto batch_id : random_batch_order) {
            pool->run([&, solver, batch_id]() {
                auto& thread_model     = thread_models[ThreadPool::get_thread_number()];
                auto thread_parameters = thread_model.parameters();
                auto& minibatch        = training[batch_id];

                auto error = thread_model.masked_predict_cost(
                    minibatch,
                    FLAGS_dropout,
                    1 // sequence forecasting problem - predict target one step ahead
                );
                error.grad();
                graph::backward();               // backpropagate
                solver->step(thread_parameters);

                // word_done_in_past_second += minibatch.total_codes;
                word_done_in_past_second += (minibatch.data.dims(0) - 1) * (minibatch.data.dims(1));
                // exponential moving average of training throughput, updated at most once per second:
                throttled_wps.maybe_run(seconds(1), [&]() {
                    average_words_per_second = 0.5 * average_words_per_second + 0.5 * word_done_in_past_second;
                    word_done_in_past_second = 0;
                });

                if (FLAGS_show_wps) {
                    journalist.tick(++batches_processed, average_words_per_second);
                } else {
                    avg_error.update(error.sum().w(0) / minibatch.total_codes);
                    journalist.tick(++batches_processed, avg_error.average());
                }

                if (FLAGS_show_reconstructions) {
                    throttled.maybe_run(seconds(10), [&]() {
                        // Tell the journalist the news can wait
                        journalist.pause();
                        graph::NoBackprop nb;

                        // pick a random example and use its first few tokens as a primer:
                        auto& random_batch        = training[utils::randint(0, training.size() - 1)];
                        auto random_example_index = utils::randint(0, random_batch.data.dims(1) - 1);
                        std::cout << random_batch.code_lengths[random_example_index] << std::endl;

                        int priming_size = utils::randint(1, std::min(6, random_batch.code_lengths[random_example_index]));
                        vector<uint> priming;
                        for (int i = 0; i < priming_size; ++i) {
                            priming.push_back(random_batch.data.w(i, random_example_index));
                        }

                        auto beams = the_beam_search(model, word_vocab, &priming);

                        vector<uint> priming_no_start(priming.begin() + 1, priming.end());

                        std::cout << "Reconstructions: " << std::endl;
                        for (auto& beam : beams) {
                            std::cout << "=> (" << std::setprecision(5) << beam.score << ") ";
                            std::cout << utils::join(word_vocab.decode(&priming_no_start), " ") << " ";
                            std::cout << utils::bold;
                            std::cout << utils::join(word_vocab.decode(&beam.solution, true), " ") << std::endl;
                            std::cout << utils::reset_color << std::endl;
                        }

                        if (visualizer != nullptr) {
                            vector<vector<string>> sentences;
                            vector<REAL_t> probs;
                            for (auto& beam : beams) {
                                sentences.emplace_back(word_vocab.decode(&beam.solution, true));
                                probs.emplace_back(beam.score);
                            }

                            auto input_sentence = make_shared<Sentence<REAL_t>>(
                                word_vocab.decode(&priming_no_start));
                            auto sentences_viz = make_shared<Sentences<REAL_t>>(sentences);
                            sentences_viz->set_weights(probs);

                            auto input_output_pair = GridLayout();
                            input_output_pair.add_in_column(0, input_sentence);
                            input_output_pair.add_in_column(1, sentences_viz);

                            visualizer->feed(input_output_pair.to_json());
                        }
                        journalist.resume();
                    });
                }
            });
        }
        pool->wait_until_idle();
        journalist.done();

        // validation error determines early-stopping patience:
        new_cost = average_error(model, validation);
        if (new_cost >= cost) {
            patience += 1;
        } else {
            patience = 0;
        }
        cost = new_cost;
        std::cout << "epoch (" << epoch << ") KL error = "
                  << std::setprecision(3) << std::fixed
                  << std::setw(5) << std::setfill(' ')
                  << new_cost
                  << " patience = " << patience << std::endl;
        maybe_save_model(&model);
        Timer::report();
        epoch++;
    }
    return 0;
}
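// A minimal sketch of the one-step-ahead target layout implied by the offset
// argument to masked_predict_cost and by the word counter above: a column of
// T tokens yields T - 1 (input, target) pairs, so a T x B batch contributes
// (T - 1) * B predictions, which is exactly the quantity added to
// word_done_in_past_second. The helper below is illustrative, not part of
// the example.
#include <cstddef>
#include <utility>
#include <vector>

std::pair<std::vector<int>, std::vector<int>>
make_next_word_pairs(const std::vector<int>& sequence) {
    std::vector<int> inputs, targets;
    for (std::size_t t = 0; t + 1 < sequence.size(); ++t) {
        inputs.push_back(sequence[t]);      // token fed to the model at step t
        targets.push_back(sequence[t + 1]); // token the model must predict
    }
    return {inputs, targets};
}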