예제 #1
0
파일: w2v.cpp 프로젝트: liangkai/DeepNLP
int main(int argc, char* argv[]) {
    // Skip-gram word2vec trainer with negative sampling.
    // Options (each consumes one value argument):
    //   -s <file>  source text corpus        -o <file>  output model file
    //   -w <int>   embedding dimension       -e <int>   training epochs
    //   -c <int>   context window radius     -n <int>   negatives per position

    std::string textfile;   // -s : source corpus file
    int vecsize = 0;        // -w : embedding dimension
    int epochs = 0;         // -e : passes over the corpus
    int context = 0;        // -c : context window radius
    int negsamples = 0;     // -n : negative samples per target word
    std::string sinkfile;   // -o : output model path

    if (argc < 2) {
        print_usage();
        // FIX: previously fell through and ran the whole pipeline with
        // empty settings (empty textfile, 0-sized matrices).
        std::exit(-1);
    }

    for (int i = 1; i < argc; ++i) {
        // Every recognized option consumes a value argument; validate the
        // flag shape and that the value exists.
        // FIX: previously a trailing flag (e.g. "-s" as the last argument)
        // read argv[argc], which is a null pointer (UB in std::string/atoi).
        if (argv[i][0] != '-' || argv[i][1] == '\0' || i + 1 >= argc) {
            print_usage();
            std::exit(-1);
        }
        switch (argv[i][1])
        {
        case 's':
            textfile = argv[++i];
            break;
        case 'w':
            vecsize = std::atoi(argv[++i]);
            break;
        case 'o':
            sinkfile = argv[++i];
            break;
        case 'e':
            epochs = std::atoi(argv[++i]);
            break;
        case 'c':
            context = std::atoi(argv[++i]);
            break;
        case 'n':
            negsamples = std::atoi(argv[++i]);
            break;
        default:
            print_usage();
            std::exit(-1);
        }
    }

    std::cout << "source text file : " << textfile << std::endl
              << "vector size      : " << vecsize << std::endl
              << "model file       : " << sinkfile << std::endl
              << "#epochs          : " << epochs << std::endl
              << "context size     : " << context << std::endl
              << "negative samples : " << negsamples << std::endl;

    std::vector<Sentence> corpus;
    std::map<std::string, size_t> word2id;   // word -> row index of the embedding matrices
    std::map<size_t, std::string> id2word;   // inverse mapping of word2id
    Eigen::RowVectorXf unidist;              // word distribution used by sampling()

    LoadData(textfile, corpus);
    BuildLexicon(word2id, id2word, corpus);
    BuildWordDist(unidist, word2id, corpus);

    int wordcounts = word2id.size();
    std::cout << "#words : " << wordcounts << std::endl;

    // Two embedding tables: "inner" rows are center-word vectors, "outer"
    // rows are context-word vectors.
    srand(0);  // Eigen's setRandom() draws from rand(); seed for reproducibility
    Eigen::MatrixXf innerVector, outerVector;
    innerVector.resize(wordcounts, vecsize);
    outerVector.resize(wordcounts, vecsize);

    innerVector.setRandom();
    outerVector.setRandom();

    std::cout << "inner " << std::endl;
    std::cout << innerVector << std::endl;
    std::cout << "outer " << std::endl;
    std::cout << outerVector << std::endl;

    std::uniform_real_distribution<double> wordselector(0.0, 1.0);
    std::default_random_engine engine(0);

    // training with word embeddings.
    for (int i = 0; i < epochs; ++i) {
        for (Sentence& sent : corpus) {
            for (int idx = 0; idx < sent.length(); ++idx) {
                // Clamp the window [leftIdx, rightIdx] around center position idx.
                size_t leftIdx = Sentence::GetPos(sent.length(), idx, -context);
                size_t rightIdx = Sentence::GetPos(sent.length(), idx, context);

                std::vector<size_t> posindx;   // word-ids of true context words
                std::vector<size_t> negindx;   // word-ids of sampled negatives

                for (size_t contextidx = leftIdx; contextidx <= rightIdx; ++contextidx) {
                    // FIX: cast removes the signed/unsigned mismatch; idx >= 0 here.
                    if (contextidx == static_cast<size_t>(idx))
                        continue;  // skip the center word itself
                    posindx.push_back(word2id[sent.word(contextidx)]);
                }

                // Rejection-sample negatives that collide with neither the
                // positives nor previously drawn negatives.
                for (int negcnt = 0; negcnt < negsamples; ++negcnt) {

                    size_t selwordidx = 0;
                    do {
                        double randomvalue = wordselector(engine);
                        selwordidx = sampling(unidist, randomvalue);
                        bool realneg = true;
                        for (size_t posidx : posindx) {
                            if (posidx == selwordidx)
                                realneg = false;
                        }

                        for (size_t negidx : negindx) {
                            if (negidx == selwordidx)
                                realneg = false;
                        }

                        if (realneg)
                            break;
                    } while (true);

                    negindx.push_back(selwordidx);
                }

                Eigen::MatrixXf posoutervec(posindx.size(), vecsize);
                Eigen::MatrixXf posgrad(posindx.size(), vecsize);

                Eigen::MatrixXf negoutervec(negindx.size(), vecsize);
                Eigen::MatrixXf neggrad(negindx.size(), vecsize);

                // Gather the outer vectors of the positive / negative samples.
                for (size_t posIdx = 0; posIdx < posindx.size(); ++posIdx) {
                    posoutervec.row(posIdx) = outerVector.row(posindx[posIdx]);
                }

                for (size_t negIdx = 0; negIdx < negindx.size(); ++negIdx) {
                    negoutervec.row(negIdx) = outerVector.row(negindx[negIdx]);
                }

                // Center word's current inner vector (copied, not a reference).
                Eigen::RowVectorXf centerInnervec = innerVector.row(word2id[sent.word(idx)]);

                // Softmax over the pooled positive + negative scores.
                Eigen::RowVectorXf posrawscore = posoutervec * centerInnervec.transpose();
                Eigen::RowVectorXf negrawscore = negoutervec * centerInnervec.transpose();

                double expsum = posrawscore.array().exp().sum() + negrawscore.array().exp().sum();

                Eigen::RowVectorXf posprob = posrawscore.array().exp() / expsum;
                Eigen::RowVectorXf negprob = negrawscore.array().exp() / expsum;

                // Gradient of the loss w.r.t. the center word's inner vector.
                Eigen::RowVectorXf centerInnergrad = posoutervec.colwise().sum() - posprob * posoutervec;
                centerInnergrad *= -1;

                // Gradient for each positive context word's outer vector.
                // FIX: the loop index used to shadow the posindx vector above;
                // renamed for readability (same iteration, same values).
                // NOTE(review): the "* posgrad.rows()" factor differs from the
                // textbook softmax gradient -(1 - p_i) * v_center — confirm intent.
                for (Eigen::Index pIdx = 0; pIdx < posgrad.rows(); ++pIdx) {
                    posgrad.row(pIdx) = -1 * (centerInnervec - posprob[pIdx] * posgrad.rows() * centerInnervec);
                }

                // Gradient for each negative sample's outer vector.
                for (Eigen::Index nIdx = 0; nIdx < neggrad.rows(); ++nIdx) {
                    neggrad.row(nIdx) = negprob[nIdx] * centerInnervec;
                }

                // SGD step.
                // NOTE(review): only the inner update is scaled by the 0.1
                // learning rate; the outer updates use an implicit step of
                // 1.0 — confirm this asymmetry is intended.
                size_t centerWordId = word2id[sent.word(idx)];
                innerVector.row(centerWordId) -= 0.1 * centerInnergrad;

                for (size_t posIdx = 0; posIdx < posindx.size(); ++posIdx) {
                    size_t wordId = posindx[posIdx];
                    outerVector.row(wordId) -= posgrad.row(posIdx);
                }

                for (size_t negIdx = 0; negIdx < negindx.size(); ++negIdx) {
                    size_t wordId = negindx[negIdx];
                    outerVector.row(wordId) -= neggrad.row(negIdx);
                }

                std::cout << "Update Vector : " << std::endl;
                std::cout << "inner " << std::endl;
                std::cout << innerVector << std::endl;
                std::cout << "outer " << std::endl;
                std::cout << outerVector << std::endl;

            }
        }
    }

    saveModel(sinkfile, word2id, innerVector, outerVector);
    return 0;
}
예제 #2
0
파일: port.c 프로젝트: Bgods/r-source
/* Driver for the PORT nlsb routine: bound-constrained nonlinear least
 * squares via reverse communication.  'm' is an R list (nls-model-like
 * object) providing getPars/setPars/resid/gradient closures; 'd' holds
 * parameter scales, 'gg' the gradient matrix, 'iv'/'v' the PORT integer
 * and double work arrays, and 'lowerb'/'upperb' the box constraints.
 * Returns R_NilValue; results are communicated through the model object
 * and the iv/v work arrays mutated in place. */
SEXP port_nlsb(SEXP m, SEXP d, SEXP gg, SEXP iv, SEXP v,
	       SEXP lowerb, SEXP upperb)
{
    int *dims = INTEGER(getAttrib(gg, R_DimSymbol));
    /* NOTE(review): n and p are both initialized from LENGTH(d) —
     * presumably both mean the parameter count; confirm p is
     * intentionally identical to n.  nd = number of residuals
     * (rows of the gradient matrix gg). */
    int i, n = LENGTH(d), p = LENGTH(d), nd = dims[0];
    SEXP getPars, setPars, resid, gradient,
	rr = PROTECT(allocVector(REALSXP, nd)),
	x = PROTECT(allocVector(REALSXP, n));
    // This used to use Calloc, but that will leak if 
    // there is a premature return (and did in package drfit)
    /* b stays NULL (unconstrained) unless valid bounds are supplied below. */
    double *b = (double *) NULL,
	*rd = (double *)R_alloc(nd, sizeof(double));

    if (!isReal(d) || n < 1)
	error(_("'d' must be a nonempty numeric vector"));
    if(!isNewList(m)) error(_("m must be a list"));
				/* Initialize parameter vector */
    getPars = PROTECT(lang1(getFunc(m, "getPars", "m")));
    eval_check_store(getPars, R_GlobalEnv, x);
				/* Create the setPars call */
    setPars = PROTECT(lang2(getFunc(m, "setPars", "m"), x));
				/* Evaluate residual and gradient */
    resid = PROTECT(lang1(getFunc(m, "resid", "m")));
    eval_check_store(resid, R_GlobalEnv, rr);
    gradient = PROTECT(lang1(getFunc(m, "gradient", "m")));
    neggrad(gradient, R_GlobalEnv, gg);

    /* Pack the bounds into PORT's interleaved form b = {l1,u1,l2,u2,...};
     * bounds of the wrong length or type are either silently ignored
     * (length mismatch) or an error (non-numeric). */
    if ((LENGTH(lowerb) == n) && (LENGTH(upperb) == n)) {
	if (isReal(lowerb) && isReal(upperb)) {
	    double *rl = REAL(lowerb), *ru = REAL(upperb);
	    b = (double *)R_alloc(2*n, sizeof(double));
	    for (i = 0; i < n; i++) {
		b[2*i] = rl[i];
		b[2*i + 1] = ru[i];
	    }
	} else error(_("'lowerb' and 'upperb' must be numeric vectors"));
    }

    /* Reverse-communication loop: nlsb_iterate sets iv[0] to request the
     * next evaluation; we service the request in R and call it again.
     * Iteration stops once iv[0] >= 3 (a PORT convergence/failure code).
     * NOTE(review): the meaning of the individual iv[0] codes below
     * follows the PORT protocol — confirm against the PORT docs. */
    do {
	nlsb_iterate(b, REAL(d), REAL(gg), INTEGER(iv), LENGTH(iv),
		     LENGTH(v), n, nd, p, REAL(rr), rd,
		     REAL(v), REAL(x));
	switch(INTEGER(iv)[0]) {
	case -3:
	    /* new parameters: push x into the model, refresh residual + gradient */
	    eval(setPars, R_GlobalEnv);
	    eval_check_store(resid, R_GlobalEnv, rr);
	    neggrad(gradient, R_GlobalEnv, gg);
	    break;
	case -2:
	    /* refresh residual + gradient at the current parameters */
	    eval_check_store(resid, R_GlobalEnv, rr);
	    neggrad(gradient, R_GlobalEnv, gg);
	    break;
	case -1:
	    eval(setPars, R_GlobalEnv);
	    eval_check_store(resid, R_GlobalEnv, rr);
	    neggrad(gradient, R_GlobalEnv, gg);
	    break;
	case 0:
	    Rprintf("nlsb_iterate returned %d", INTEGER(iv)[0]);
	    break;
	case 1:
	    /* residual only */
	    eval(setPars, R_GlobalEnv);
	    eval_check_store(resid, R_GlobalEnv, rr);
	    break;
	case 2:
	    /* gradient only */
	    eval(setPars, R_GlobalEnv);
	    neggrad(gradient, R_GlobalEnv, gg);
	    break;
	}
    } while(INTEGER(iv)[0] < 3);

    /* Balances the six PROTECTs above: rr, x, getPars, setPars, resid, gradient. */
    UNPROTECT(6);
    return R_NilValue;
}