// Tokenizes one sentence, maps each token to its vocabulary id, and applies
// frequency-based subsampling. Returns the number of in-vocabulary tokens seen
// (before subsampling); the kept word ids are appended to *words.
int parse_sentence(const string& sentence, const Vocabulary& vocab,
                   real subsample_thres, unsigned* p_seed,
                   vector<uint64_t>* words) {
  istringstream iss(sentence);
  uint64_t total_cnt = vocab.total_cnt();
  int word_cnt = 0;
  string word;
  while (iss >> word) {
    uint64_t word_id;
    if (!vocab.find_word_id(word, &word_id)) {
      continue;  // skip out-of-vocabulary tokens
    }
    ++word_cnt;
    if (subsample_thres > 0) {
      // Keep probability (sqrt(1/t) + 1) * t with t = thres * total / cnt.
      // This follows the original word2vec code, not the paper, which uses sqrt(t).
      double t = subsample_thres * total_cnt / vocab.get_word_cnt(word_id);
      double remain_prob = (sqrt(1 / t) + 1) * t;
      if (remain_prob < static_cast<real>(rand_r(p_seed)) / RAND_MAX) {
        continue;  // discard this frequent word with probability 1 - remain_prob
      }
    }
    words->push_back(word_id);
  }
  return word_cnt;
}
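
// Illustrative sketch (hypothetical helper, not called by the training code):
// the keep probability that parse_sentence() implements, written out for a
// single word. With word frequency f = cnt / total and threshold t =
// subsample_thres, the word is kept with probability (sqrt(f / t) + 1) * t / f,
// which matches the original word2vec C code rather than the paper's sqrt(t / f).
// Assumes <cmath> and <cstdint> are already included above.
inline double subsample_keep_prob(uint64_t cnt, uint64_t total_cnt,
                                  double subsample_thres) {
  double f = static_cast<double>(cnt) / total_cnt;  // unigram frequency of the word
  return (sqrt(f / subsample_thres) + 1.0) * subsample_thres / f;
}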
int main(int argc, char **argv) {
  uint64_t hidden_layer_size = 100;
  int min_count = 5;
  TrainPara train_para;
  string save_vocab_file;
  string read_vocab_file;
  string train_file;
  string vector_file;

  if (argc < 3) {
    cerr << usage << endl;
    return -1;
  }
  // The last two positional arguments are the training corpus and the output
  // word-vector file; everything before them is parsed as "-flag value" pairs.
  train_file = argv[argc - 2];
  vector_file = argv[argc - 1];
  for (int i = 1; i < argc - 2; i += 2) {
    string arg = argv[i];
    const char* val = argv[i + 1];
    if (arg == "-size") {
      hidden_layer_size = atoi(val);
    } else if (arg == "-type") {
      if (string(val) == "cbow") {
        train_para.type = CBOW;
      } else if (string(val) == "skip-gram") {
        train_para.type = SKIP_GRAM;
      } else {
        cerr << "unknown -type: " << val << endl;
        return -1;
      }
    } else if (arg == "-algo") {
      if (string(val) == "ns") {
        train_para.algo = NEG_SAMPLING;
      } else if (string(val) == "hs") {
        train_para.algo = HIER_SOFTMAX;
      } else {
        cerr << "unknown -algo: " << val << endl;
        return -1;
      }
    } else if (arg == "-neg-sample") {
      train_para.neg_sample_cnt = atoi(val);
    } else if (arg == "-window") {
      train_para.window_size = atoi(val);
    } else if (arg == "-subsample") {
      train_para.subsample_thres = atof(val);
    } else if (arg == "-thread") {
      train_para.thread_cnt = atoi(val);
    } else if (arg == "-iter") {
      train_para.iter_cnt = atoi(val);
    } else if (arg == "-min-count") {
      min_count = atoi(val);
    } else if (arg == "-alpha") {
      train_para.alpha = atof(val);
    } else if (arg == "-save-vocab") {
      save_vocab_file = val;
    } else if (arg == "-read-vocab") {
      read_vocab_file = val;
    } else {
      cerr << "unknown argument: '" << arg << "'" << endl;
      return -1;
    }
  }

  // Default learning rate depends on the model type, as in the original word2vec.
  if (train_para.alpha < 0) {
    if (train_para.type == CBOW) {
      train_para.alpha = 0.05;
    } else {
      train_para.alpha = 0.025;
    }
  }

  cerr << "parameters:" << endl
       << "size = " << hidden_layer_size << endl
       << "type = " << ((train_para.type == CBOW) ? "cbow" : "skip-gram") << endl
       << "algo = " << ((train_para.algo == HIER_SOFTMAX) ? "hs" : "neg sampling") << endl
       << "neg sampling cnt = " << train_para.neg_sample_cnt << endl
       << "window = " << train_para.window_size << endl
       << "subsample thres = " << train_para.subsample_thres << endl
       << "thread = " << train_para.thread_cnt << endl
       << "iter = " << train_para.iter_cnt << endl
       << "min count = " << min_count << endl
       << "alpha = " << train_para.alpha << endl
       << "save vocab = " << save_vocab_file << endl
       << "read vocab = " << read_vocab_file << endl
       << "training file = " << train_file << endl
       << "word vector file = " << vector_file << endl << endl;

  print_log("start ...");
  ifstream ifs_train(train_file.c_str());
  if (!ifs_train) {
    cerr << "can't open: " << train_file << endl;
    return -1;
  }

  // First pass over the corpus: build the vocabulary and word counts.
  Vocabulary vocab;
  HuffmanTree* huffman_tree = NULL;
  vocab.parse(ifs_train, min_count);
  cerr << "vocab size = " << vocab.size()
       << ", total words count = " << vocab.total_cnt() << endl;
  print_log("calc vocab finished ...");
  ifs_train.close();

  if (!save_vocab_file.empty()) {
    ofstream ofs_vocab(save_vocab_file.c_str());
    if (!ofs_vocab) {
      cerr << "can't write to " << save_vocab_file << endl;
      return -1;
    }
    vocab.save(ofs_vocab);
    print_log("save vocab finished ...");
  }

  if (train_para.algo == NEG_SAMPLING) {
    vocab.init_sampling_table();
    print_log("init sampling table finished ...");
  } else if (train_para.algo == HIER_SOFTMAX) {
    huffman_tree = new HuffmanTree(vocab.vocab());
    print_log("grow huffman tree finished ...");
  }

  Net net(vocab.size(), hidden_layer_size);
  print_log("net init finished ...");

  // NOTE: huffman_tree is NULL under negative sampling; train() is expected to
  // touch it only when the algorithm is hierarchical softmax.
  if (!train(train_file, vocab, *huffman_tree, net, train_para)) {
    cerr << "training failed" << endl;
    return -1;
  }
  print_log("training finished ...");

  ofstream ofs_result(vector_file.c_str());
  if (!ofs_result) {
    cerr << "can't write to " << vector_file << endl;
    return -1;
  }
  save_word_vec(ofs_result, net, vocab);
  ofs_result.close();
  print_log("saving word vector finished ...");

  delete huffman_tree;
}
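
// Example invocation (hypothetical binary name and file paths; the flags match
// the parser in main() above, with the corpus and output vector file as the
// last two positional arguments):
//
//   ./word2vec -size 100 -type skip-gram -algo ns -neg-sample 5 -window 5 \
//              -subsample 1e-4 -thread 4 -iter 5 -min-count 5 \
//              -save-vocab vocab.txt corpus.txt vectors.txt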