Exemplo n.º 1
0
void BilingualModel::train(const string& src_file, const string& trg_file, bool initialize) {
    std::cout << "Training files: " << src_file << ", " << trg_file << std::endl;

    if (initialize) {
        if (config->verbose)
            std::cout << "Creating new model" << std::endl;

        src_model.readVocab(src_file);
        trg_model.readVocab(trg_file);
        src_model.initNet();
        trg_model.initNet();
    } else {
        // TODO: check that initialization is fine
    }

    words_processed = 0;
    alpha = config->learning_rate;

    // read files to find out the beginning of each chunk
    auto src_chunks = src_model.chunkify(src_file, config->threads);
    auto trg_chunks = trg_model.chunkify(trg_file, config->threads);

    high_resolution_clock::time_point start = high_resolution_clock::now();
    if (config->threads == 1) {
        trainChunk(src_file, trg_file, src_chunks, trg_chunks, 0);
    } else {
        vector<thread> threads;

        for (int i = 0; i < config->threads; ++i) {
            threads.push_back(thread(&BilingualModel::trainChunk, this,
                src_file, trg_file, src_chunks, trg_chunks, i));
        }

        for (auto it = threads.begin(); it != threads.end(); ++it) {
            it->join();
        }
    }
    high_resolution_clock::time_point end = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(end - start).count();

    if (config->verbose)
        std::cout << std::endl;

    std::cout << "Training time: " << static_cast<float>(duration) / 1000000 << std::endl;
}
Exemplo n.º 2
0
void MonolingualModel::train(const string& training_file) {
    cout << "MultiVec-mono" << endl;
    config.print();
    cout << "Training file: " << training_file << endl;

    if (!config.freeze) {
        // reads vocab and counts words
        readVocab(training_file); // TODO: incremental training
        if (config.verbose)
            cout << "Total number of words: " << training_words << endl;
    }

    words_processed = 0;
    alpha = config.starting_alpha;

    // read file to find out the beginning of each chunk
    auto chunks = chunkify(training_file, config.n_threads); // also counts the number of lines
    if (!config.freeze)
        initNet();
    if (config.sent_vector)
        initSentWeights();

    high_resolution_clock::time_point start = high_resolution_clock::now();
    if (config.n_threads == 1) {
        trainChunk(training_file, chunks, 0);
    } else {
        vector<thread> threads;

        for (int i = 0; i < config.n_threads; ++i) {
            threads.push_back(thread(&MonolingualModel::trainChunk, this,
                training_file, chunks, i));
        }

        for (auto it = threads.begin(); it != threads.end(); ++it) {
            it->join();
        }
    }
    high_resolution_clock::time_point end = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(end - start).count();

    if (config.verbose)
        cout << endl;

    cout << "Training time: " << static_cast<float>(duration) / 1000000 << endl;
}