void BilingualModel::train(const string& src_file, const string& trg_file, bool initialize) { std::cout << "Training files: " << src_file << ", " << trg_file << std::endl; if (initialize) { if (config->verbose) std::cout << "Creating new model" << std::endl; src_model.readVocab(src_file); trg_model.readVocab(trg_file); src_model.initNet(); trg_model.initNet(); } else { // TODO: check that initialization is fine } words_processed = 0; alpha = config->learning_rate; // read files to find out the beginning of each chunk auto src_chunks = src_model.chunkify(src_file, config->threads); auto trg_chunks = trg_model.chunkify(trg_file, config->threads); high_resolution_clock::time_point start = high_resolution_clock::now(); if (config->threads == 1) { trainChunk(src_file, trg_file, src_chunks, trg_chunks, 0); } else { vector<thread> threads; for (int i = 0; i < config->threads; ++i) { threads.push_back(thread(&BilingualModel::trainChunk, this, src_file, trg_file, src_chunks, trg_chunks, i)); } for (auto it = threads.begin(); it != threads.end(); ++it) { it->join(); } } high_resolution_clock::time_point end = high_resolution_clock::now(); auto duration = duration_cast<microseconds>(end - start).count(); if (config->verbose) std::cout << std::endl; std::cout << "Training time: " << static_cast<float>(duration) / 1000000 << std::endl; }
void MonolingualModel::train(const string& training_file) { cout << "MultiVec-mono" << endl; config.print(); cout << "Training file: " << training_file << endl; if (!config.freeze) { // reads vocab and counts words readVocab(training_file); // TODO: incremental training if (config.verbose) cout << "Total number of words: " << training_words << endl; } words_processed = 0; alpha = config.starting_alpha; // read file to find out the beginning of each chunk auto chunks = chunkify(training_file, config.n_threads); // also counts the number of lines if (!config.freeze) initNet(); if (config.sent_vector) initSentWeights(); high_resolution_clock::time_point start = high_resolution_clock::now(); if (config.n_threads == 1) { trainChunk(training_file, chunks, 0); } else { vector<thread> threads; for (int i = 0; i < config.n_threads; ++i) { threads.push_back(thread(&MonolingualModel::trainChunk, this, training_file, chunks, i)); } for (auto it = threads.begin(); it != threads.end(); ++it) { it->join(); } } high_resolution_clock::time_point end = high_resolution_clock::now(); auto duration = duration_cast<microseconds>(end - start).count(); if (config.verbose) cout << endl; cout << "Training time: " << static_cast<float>(duration) / 1000000 << endl; }