/**
 * Write every vocabulary word together with its input vector to `os` as text.
 *
 * Output layout:
 *   - a header line: "<number of words> <vector size>"
 *   - one line per word: the word, then each vector component preceded by a
 *     single space.
 *
 * @param os    destination stream; it is flushed once before returning.
 * @param net   supplies the vector length (hidden_layer_size()) and the
 *              vector for word index i (get_input_vec(i)).
 * @param vocab supplies the word list (vocab()); entry i is paired with
 *              net.get_input_vec(i).
 */
void save_word_vec(ostream& os, const Net& net, const Vocabulary& vocab) {
    const size_t sz = net.hidden_layer_size();
    const vector<Word>& words = vocab.vocab();

    // '\n' rather than endl: endl flushes the stream on every row, which is
    // needlessly slow when there is one row per vocabulary entry.
    os << words.size() << " " << sz << '\n';

    for (size_t i = 0; i != words.size(); ++i) {
        os << words[i].word;
        const real* v = net.get_input_vec(i);
        for (size_t j = 0; j != sz; ++j) {
            os << " " << v[j];
        }
        os << '\n';
    }

    // Single flush so that, as before, everything has been pushed to the
    // underlying buffer's sink by the time the function returns.
    os.flush();
}
int main(int argc, char **argv) { uint64_t hidden_layer_size = 100; int min_count = 5; TrainPara train_para; string save_vocab_file; string read_vocab_file; string train_file; string vector_file; if (argc < 3) { cerr << usage << endl; return -1; } train_file = argv[argc - 2]; vector_file = argv[argc - 1]; for (int i = 1; i < argc - 2; i += 2) { string arg = argv[i]; const char* val = argv[i + 1]; if (arg == "-size") { hidden_layer_size = atoi(val); } else if (arg == "-type") { if (string(val) == "cbow") { train_para.type = CBOW; } else if (string(val) == "skip-gram") { train_para.type = SKIP_GRAM; } else { cerr << "unknown -type: " << val << endl;; return -1; } } else if (arg == "-algo") { if (string(val) == "ns") { train_para.algo = NEG_SAMPLING; } else if (string(val) == "hs") { train_para.algo = HIER_SOFTMAX; } else { cerr << "unknown -algo: " << val << endl;; return -1; } } else if (arg == "-neg-sample") { train_para.neg_sample_cnt = atoi(val); } else if (arg == "-window") { train_para.window_size = atoi(val); } else if (arg == "-subsample") { train_para.subsample_thres = atof(val); } else if (arg == "-thread") { train_para.thread_cnt = atoi(val); } else if (arg == "-iter") { train_para.iter_cnt = atoi(val); } else if (arg == "-min-count") { min_count = atoi(val); } else if (arg == "-alpha") { train_para.alpha = atof(val); } else if (arg == "-save-vocab") { save_vocab_file = val; } else if (arg == "-read-vocab") { read_vocab_file = val; } else { cerr << "unknow argument: '" << arg << "'" << endl; return -1; } } if (train_para.alpha < 0) { if (train_para.type == CBOW) { train_para.alpha = 0.05; } else { train_para.alpha = 0.025; } } cerr << "parameters:" << endl << "size = " << hidden_layer_size << endl << "type = " << ((train_para.type==CBOW)?"cbow":"skip-gram") << endl << "algo = " << ((train_para.algo==HIER_SOFTMAX)?"hs":"neg sampling") << endl << "neg sampling cnt = " << train_para.neg_sample_cnt << endl << "window = " << train_para.window_size << endl << 
"subsample thres = " << train_para.subsample_thres << endl << "thread = " << train_para.thread_cnt << endl << "iter = " << train_para.iter_cnt << endl << "min count = " << min_count << endl << "alpha = " << train_para.alpha << endl << "save vocab = " << save_vocab_file << endl << "read vocab = " << read_vocab_file << endl << "training file = " << train_file << endl << "word vector file = " << vector_file << endl << endl; print_log("start ..."); ifstream ifs_train(train_file.c_str()); if (!ifs_train) { cerr << "can't open: " << train_file << endl; return -1; } Vocabulary vocab; HuffmanTree* huffman_tree = NULL; vocab.parse(ifs_train, min_count); cerr << "vocab size = " << vocab.size() << ", total words count = " << vocab.total_cnt() << endl; print_log("calc vocab finished ..."); ifs_train.close(); if (!save_vocab_file.empty()) { ofstream ofs_vocab(save_vocab_file.c_str()); if (!ofs_vocab) { cerr << "can't write to " << save_vocab_file << endl; return -1; } vocab.save(ofs_vocab); print_log("save vocab finished ..."); } if (train_para.algo == NEG_SAMPLING) { vocab.init_sampling_table(); print_log("init sampling table finished ..."); } else if (train_para.algo == HIER_SOFTMAX) { huffman_tree = new HuffmanTree(vocab.vocab()); print_log("grow huffman tree finished ..."); } Net net(vocab.size(), hidden_layer_size); print_log("net init finished ..."); if (!train(train_file, vocab, *huffman_tree, net, train_para)) { cerr << "training failed" << endl; return -1; } print_log("training finished ..."); ofstream ofs_result(vector_file.c_str()); if (!ofs_result) { cerr << "can't write to " << vector_file << endl; return -1; } save_word_vec(ofs_result, net, vocab); ofs_result.close(); print_log("saving word vector finished ..."); delete huffman_tree; }