int main(int argc, char* argv[]) { std::string textfile; int vecsize = 0; int epochs = 0; int context = 0; int negsamples = 0; std::string sinkfile; if (argc < 2) { print_usage(); } for (int i = 1; i < argc; ++i) { switch (argv[i][1]) { case 's': textfile = argv[++i]; break; case 'w': vecsize = std::atoi(argv[++i]); break; case 'o': sinkfile = argv[++i]; break; case 'e': epochs = std::atoi(argv[++i]); break; case 'c': context = std::atoi(argv[++i]); break; case 'n': negsamples = std::atoi(argv[++i]); break; default: print_usage(); std::exit(-1); } } std::cout << "source text file : " << textfile << std::endl << "vector size : " << vecsize << std::endl << "model file : " << sinkfile << std::endl << "#epochs : " << epochs << std::endl << "context size : " << context << std::endl << "negative samples : " << negsamples << std::endl; std::vector<Sentence> corpus; std::map<std::string, size_t> word2id; std::map<size_t, std::string> id2word; Eigen::RowVectorXf unidist; LoadData(textfile, corpus); BuildLexicon(word2id, id2word, corpus); BuildWordDist(unidist, word2id, corpus); int wordcounts = word2id.size(); std::cout << "#words : " << wordcounts << std::endl; srand(0); Eigen::MatrixXf innerVector, outerVector; innerVector.resize(wordcounts, vecsize); outerVector.resize(wordcounts, vecsize); innerVector.setRandom(); outerVector.setRandom(); std::cout << "inner " << std::endl; std::cout << innerVector << std::endl; std::cout << "outer " << std::endl; std::cout << outerVector << std::endl; std::uniform_real_distribution<double> wordselector(0.0, 1.0); std::default_random_engine engine(0); // training with word embeddings. 
for (int i = 0; i < epochs; ++i) { for (Sentence& sent : corpus) { for (int idx = 0; idx < sent.length(); ++idx) { size_t leftIdx = Sentence::GetPos(sent.length(), idx, -context); size_t rightIdx = Sentence::GetPos(sent.length(), idx, context); std::vector<size_t> posindx; std::vector<size_t> negindx; for (size_t contextidx = leftIdx; contextidx <= rightIdx; ++contextidx) { if (contextidx == idx) continue; posindx.push_back(word2id[sent.word(contextidx)]); } for (int negcnt = 0; negcnt < negsamples; ++ negcnt) { size_t selwordidx = 0; do { double randomvalue = wordselector(engine); selwordidx = sampling(unidist, randomvalue); bool realneg = true; for (size_t posidx : posindx) { if (posidx == selwordidx) realneg = false; } for (size_t negidx : negindx) { if (negidx == selwordidx) realneg = false; } if (realneg) break; } while (true); negindx.push_back(selwordidx); } Eigen::MatrixXf posoutervec(posindx.size(), vecsize); Eigen::MatrixXf posgrad(posindx.size(), vecsize); Eigen::MatrixXf negoutervec(negindx.size(), vecsize); Eigen::MatrixXf neggrad(negindx.size(), vecsize); // update the information for (int posIdx = 0; posIdx < posindx.size(); ++posIdx) { posoutervec.row(posIdx) = outerVector.row(posindx[posIdx]); } for (int negIdx = 0; negIdx < negindx.size(); ++negIdx) { negoutervec.row(negIdx) = outerVector.row(negindx[negIdx]); } // initialize the vector Eigen::RowVectorXf centerInnervec = innerVector.row(word2id[sent.word(idx)]); Eigen::RowVectorXf posrawscore = posoutervec * centerInnervec.transpose(); Eigen::RowVectorXf negrawscore = negoutervec * centerInnervec.transpose(); double expsum = posrawscore.array().exp().sum() + negrawscore.array().exp().sum(); Eigen::RowVectorXf posprob = posrawscore.array().exp() / expsum; Eigen::RowVectorXf negprob = negrawscore.array().exp() / expsum; // calculate the gradient of center word inner vector. 
Eigen::RowVectorXf centerInnergrad = posoutervec.colwise().sum() - posprob * posoutervec; centerInnergrad *= -1; // calculate the gradient of surrounding word outer vector for (size_t posindx = 0; posindx < posgrad.rows(); ++posindx) { posgrad.row(posindx) = -1 * (centerInnervec - posprob[posindx] * posgrad.rows() * centerInnervec); } // calculate the negative samples for (size_t negidx = 0; negidx < neggrad.rows(); ++negidx) { neggrad.row(negidx) = negprob[negidx] * centerInnervec; } // update the gradient size_t centerWordId = word2id[sent.word(idx)]; innerVector.row(centerWordId) -= 0.1 * centerInnergrad; for (size_t posIdx = 0; posIdx < posindx.size(); ++posIdx) { size_t wordId = posindx[posIdx]; outerVector.row(wordId) -= posgrad.row(posIdx); } for (size_t negIdx = 0; negIdx < negindx.size(); ++negIdx) { size_t wordId = negindx[negIdx]; outerVector.row(wordId) -= neggrad.row(negIdx); } std::cout << "Update Vector : " << std::endl; std::cout << "inner " << std::endl; std::cout << innerVector << std::endl; std::cout << "outer " << std::endl; std::cout << outerVector << std::endl; } } } saveModel(sinkfile, word2id, innerVector, outerVector); return 0; }
// Drive the PORT bounded nonlinear least-squares iterator (nlsb_iterate) from R.
//
//  m       - an R list object ("nls model") providing closures getPars, setPars,
//            resid and gradient (fetched via getFunc)
//  d       - numeric scale vector; its length defines both n and p below
//  gg      - numeric gradient matrix; rows (dim[0]) give nd, the residual length
//  iv, v   - PORT integer/double work vectors; iv[0] carries the iterator's
//            state/return code between calls
//  lowerb,
//  upperb  - optional box constraints; used only when both have length n
//
// Returns R_NilValue; the fitted state lives in the model object m, updated
// through the setPars call as the iteration proceeds.
SEXP port_nlsb(SEXP m, SEXP d, SEXP gg, SEXP iv, SEXP v,
               SEXP lowerb, SEXP upperb)
{
    int *dims = INTEGER(getAttrib(gg, R_DimSymbol));
    int i, n = LENGTH(d), p = LENGTH(d), nd = dims[0];
    // rr holds the residual vector, x the parameter vector; both PROTECTed
    // (2 of the 6 protections released by UNPROTECT(6) at the end).
    SEXP getPars, setPars, resid, gradient,
        rr = PROTECT(allocVector(REALSXP, nd)),
        x = PROTECT(allocVector(REALSXP, n));
    // This used to use Calloc, but that will leak if
    // there is a premature return (and did in package drfit)
    // (R_alloc memory is reclaimed automatically, even on error().)
    double *b = (double *) NULL,
        *rd = (double *)R_alloc(nd, sizeof(double));

    if (!isReal(d) || n < 1)
        error(_("'d' must be a nonempty numeric vector"));
    if(!isNewList(m)) error(_("m must be a list"));

    /* Initialize parameter vector */
    getPars = PROTECT(lang1(getFunc(m, "getPars", "m")));
    eval_check_store(getPars, R_GlobalEnv, x);
    /* Create the setPars call */
    // Note: the call captures x itself, so writing into x and evaluating
    // setPars pushes the current parameters back into the model.
    setPars = PROTECT(lang2(getFunc(m, "setPars", "m"), x));
    /* Evaluate residual and gradient */
    resid = PROTECT(lang1(getFunc(m, "resid", "m")));
    eval_check_store(resid, R_GlobalEnv, rr);
    gradient = PROTECT(lang1(getFunc(m, "gradient", "m")));
    neggrad(gradient, R_GlobalEnv, gg);

    // Box constraints: only honoured when both bounds are numeric vectors of
    // length n; packed interleaved as b = {lo[0],hi[0],lo[1],hi[1],...},
    // the layout nlsb_iterate expects. b stays NULL for the unbounded case.
    if ((LENGTH(lowerb) == n) && (LENGTH(upperb) == n)) {
        if (isReal(lowerb) && isReal(upperb)) {
            double *rl = REAL(lowerb), *ru = REAL(upperb);
            b = (double *)R_alloc(2*n, sizeof(double));
            for (i = 0; i < n; i++) {
                b[2*i] = rl[i];
                b[2*i + 1] = ru[i];
            }
        } else error(_("'lowerb' and 'upperb' must be numeric vectors"));
    }

    // Reverse-communication loop: nlsb_iterate signals via iv[0] what it
    // needs next (NOTE(review): code meanings assumed from the PORT/nlsb
    // documentation — negative codes request re-evaluation, positive codes
    // >= 3 indicate convergence/termination; confirm against the PORT docs).
    do {
        nlsb_iterate(b, REAL(d), REAL(gg), INTEGER(iv), LENGTH(iv),
                     LENGTH(v), n, nd, p, REAL(rr), rd,
                     REAL(v), REAL(x));
        switch(INTEGER(iv)[0]) {
        case -3:
            // new parameters: push x into the model, refresh residual + gradient
            eval(setPars, R_GlobalEnv);
            eval_check_store(resid, R_GlobalEnv, rr);
            neggrad(gradient, R_GlobalEnv, gg);
            break;
        case -2:
            // refresh residual + gradient at the current parameters
            eval_check_store(resid, R_GlobalEnv, rr);
            neggrad(gradient, R_GlobalEnv, gg);
            break;
        case -1:
            eval(setPars, R_GlobalEnv);
            eval_check_store(resid, R_GlobalEnv, rr);
            neggrad(gradient, R_GlobalEnv, gg);
            break;
        case 0:
            // unexpected: report and keep looping (iv[0] == 0 < 3)
            Rprintf("nlsb_iterate returned %d", INTEGER(iv)[0]);
            break;
        case 1:
            // residual only
            eval(setPars, R_GlobalEnv);
            eval_check_store(resid, R_GlobalEnv, rr);
            break;
        case 2:
            // gradient only
            eval(setPars, R_GlobalEnv);
            neggrad(gradient, R_GlobalEnv, gg);
            break;
        }
    } while(INTEGER(iv)[0] < 3);
    UNPROTECT(6); // rr, x, getPars, setPars, resid, gradient
    return R_NilValue;
}