Exemplo n.º 1
0
/**
 * Stores the given matrix as the feature matrix of this object.
 *
 * All subsets are removed and free_feature_matrix() is called before the
 * new matrix is assigned. The feature count is taken from the matrix's row
 * count and the vector count from its column count.
 *
 * @param matrix matrix to store (rows are counted as features, columns as vectors)
 */
template<class ST> void CDenseFeatures<ST>::set_feature_matrix(SGMatrix<ST> matrix)
{
	// Clear subsets and the current matrix before installing the new one.
	this->m_subset_stack->remove_all_subsets();
	this->free_feature_matrix();

	this->feature_matrix = matrix;
	this->num_vectors = matrix.num_cols;
	this->num_features = matrix.num_rows;
}
Exemplo n.º 2
0
/**
 * Hands the current feature matrix to the caller and resets this object.
 *
 * A copy of the matrix handle is taken first; afterwards all subsets are
 * removed, the feature cache is unreferenced, clean_preprocessors() is
 * called and free_feature_matrix() is called on this object.
 *
 * @return the matrix handle that was held before the reset
 */
template<class ST> SGMatrix<ST> CDenseFeatures<ST>::steal_feature_matrix()
{
	// Grab the handle before anything below resets the member.
	SGMatrix<ST> stolen(this->feature_matrix);

	this->m_subset_stack->remove_all_subsets();
	SG_UNREF(this->feature_cache);
	this->clean_preprocessors();
	this->free_feature_matrix();

	return stolen;
}
Exemplo n.º 3
0
/**
 * Replaces the feature matrix with a clone of the given matrix.
 *
 * Reports an error through SG_ERROR when a subset is currently set.
 * Otherwise free_feature_matrix() is called, a clone of src is stored, the
 * feature/vector counts are taken from src's row/column counts and
 * initialize_cache() is called.
 *
 * @param src matrix to clone (rows are counted as features, columns as vectors)
 */
template<class ST> void CDenseFeatures<ST>::copy_feature_matrix(SGMatrix<ST> src)
{
	if (this->m_subset_stack->has_subsets())
	{
		SG_ERROR("A subset is set, cannot call copy_feature_matrix\n");
	}

	this->free_feature_matrix();

	this->feature_matrix = src.clone();
	this->num_vectors = src.num_cols;
	this->num_features = src.num_rows;

	this->initialize_cache();
}
Exemplo n.º 4
0
/**
 * Fills this object with the computed feature vectors of a CDotFeatures
 * instance.
 *
 * All subsets are removed, the current matrix is freed and a new matrix of
 * size (feature space dimension) x (number of vectors) is allocated. Vector
 * i of df is written to the i-th consecutive run of num_feat entries of the
 * matrix storage, each value being converted from float64_t to ST.
 *
 * @param df source features; must report a positive dimension and a
 *           positive number of vectors (checked with ASSERT)
 */
template<class ST> void CDenseFeatures<ST>::obtain_from_dot(CDotFeatures* df)
{
	m_subset_stack->remove_all_subsets();

	const int32_t num_feat = df->get_dim_feature_space();
	const int32_t num_vec = df->get_num_vectors();

	ASSERT(num_feat>0 && num_vec>0)

	free_feature_matrix();
	feature_matrix = SGMatrix<ST>(num_feat, num_vec);

	for (int32_t vec_idx = 0; vec_idx < num_vec; ++vec_idx)
	{
		SGVector<float64_t> v = df->get_computed_dot_feature_vector(vec_idx);
		ASSERT(num_feat==v.vlen)

		// 64-bit offset so that large matrices do not overflow the index.
		ST* column = feature_matrix.matrix + vec_idx * int64_t(num_feat);
		for (int32_t row = 0; row < num_feat; ++row)
			column[row] = static_cast<ST>(v.vector[row]);
	}

	num_features = num_feat;
	num_vectors = num_vec;
}
Exemplo n.º 5
0
/**
 * Resets the feature storage of this object: removes all subsets, calls
 * free_feature_matrix() and unreferences the feature cache.
 */
template<class ST> void CDenseFeatures<ST>::free_features()
{
	this->m_subset_stack->remove_all_subsets();
	this->free_feature_matrix();

	SG_UNREF(this->feature_cache);
}
Exemplo n.º 6
0
ParserTestMetric test_KernelPerceptronModel(void *mdl, const CoNLLCorpus corpus, bool use_temp_weight, FILE *gold_ofp, FILE *model_ofp) {
    ParserTestMetric metric = create_ParserTestMetric();

    PerceptronModel linear_mdl;
    KernelPerceptron kernel_mdl;
    if (kernel == KLINEAR) {
        linear_mdl = (PerceptronModel) mdl;
    } else {
        kernel_mdl = (KernelPerceptron) mdl;
    }

    for (int si = 0; si < DArray_count(corpus->sentences); si++) {
        FeaturedSentence sent = DArray_get(corpus->sentences, si);

        debug("Test sentence %d (section %d) of length %d", si, sent->section, sent->length);

        if (kernel != KLINEAR) {
                debug("Generating feature matrix for sentence %d", si);
                set_FeatureMatrix(NULL, corpus, si);
        }

        if (kernel == KLINEAR) {
            if (use_temp_weight) {
                debug("\tI will be using a weight vector (raw) of length %ld", linear_mdl->embedding_w_temp->n);
                build_adjacency_matrix(corpus, si, linear_mdl->embedding_w_temp, NULL);
            } else {
                debug("\tI will be using a weight vector (averaged) of length %ld", linear_mdl->embedding_w->n);
                build_adjacency_matrix(corpus, si, linear_mdl->embedding_w, NULL);
            }

        } else {
            debug("Generating adj. matrix for sentence %d", si);
            set_adjacency_matrix_fast(corpus, si, kernel_mdl, true);
        }


        debug("Now parsing sentence %d", si);
        int *model = parse(sent);

        (metric->total_sentence)++;
        debug("Now comparing actual arcs with model generated arcs for sentence %d (Last sentence is %d)", si, sent->length);
        for (int j = 0; j < sent->length; j++) {
            Word w = (Word) DArray_get(sent->words, j);

            w->predicted_parent = model[j + 1];

            // TODO: One file per section idea 
            if (model_ofp != NULL)
                dump_conll_word(w, true, model_ofp);

            if (gold_ofp != NULL)
                dump_conll_word(w, false, gold_ofp);

            if (w->parent == 0 && model[j + 1] == 0)
                (metric->true_root_predicted)++;

            debug("\tTrue parent of word %d (with %s:%s) is %d whereas estimated parent is %d", j + 1, w->postag, w->form, w->parent, model[j + 1]);

            int pmatch_nopunc = 0, ptotal_nopunc = 0, pmatch = 0;
            if (strcmp(w->postag, ",") != 0 && strcmp(w->postag, ":") != 0 && strcmp(w->postag, ".") != 0 && strcmp(w->postag, "``") != 0 && strcmp(w->postag, "''") != 0) {

                if (w->parent == model[j + 1]) {
                    (metric->without_punc->true_prediction)++;
                    pmatch_nopunc++;
                }

                ptotal_nopunc++;

                (metric->without_punc->total_prediction)++;
            }

            if (pmatch_nopunc == ptotal_nopunc && pmatch_nopunc != 0) {
                (metric->complete_sentence_without_punc)++;
            }


            (metric->all->total_prediction)++;

            if (w->parent == model[j + 1]) {
                pmatch++;
                (metric->all->true_prediction)++;
            }

            if (pmatch == sent->length && pmatch != 0)
                (metric->complete_sentence)++;
        }

        if (model_ofp != NULL) {
            fprintf(model_ofp, "\n");
        }

        if (gold_ofp != NULL) {
            fprintf(gold_ofp, "\n");
        }

        free(model);


        debug("Releasing feature matrix for sentence %d", si);

        free_feature_matrix(corpus, si);
    }

    return metric;
}
Exemplo n.º 7
0
/**
 * Runs one training pass of the kernel perceptron over the corpus.
 *
 * For every training sentence the feature and adjacency matrices are built,
 * the sentence is parsed and the predicted parents are compared with the
 * parents returned by get_parents(). For each word whose predicted parent
 * differs, update_alpha() is called with -1 for the predicted arc and with
 * +1 for the reference arc. When `budget_method` is RANDOMIZED and the
 * hypothesis set (`mdl->M`) exceeds `budget_target`, the surplus is passed
 * to delete_n_random_hypothesis(). After the pass update_average_alpha()
 * is called, and the support vectors are dumped when `verbosity > 0`.
 *
 * @param mdl     kernel perceptron to update
 * @param corpus  training corpus
 * @param max_rec number of leading sentences to train on, or -1 for all of
 *                them; values larger than the corpus size are clamped to
 *                the corpus size
 */
void train_once_KernelPerceptronModel(KernelPerceptron mdl, const CoNLLCorpus corpus, int max_rec) {
    long match = 0, total = 0;
    int max_sv = 0;

    // Never iterate past the sentences that are actually stored.
    const int available = DArray_count(corpus->sentences);
    int ninstance = (max_rec == -1) ? available : max_rec;
    if (ninstance > available) {
        ninstance = available;
    }

    log_info("Total number of training instances %d", ninstance);

    for (int si = 0; si < ninstance; si++) {

        FeaturedSentence sent = (FeaturedSentence) DArray_get(corpus->sentences, si);

        debug("Building feature matrix for sentence %d", si);
        set_FeatureMatrix(NULL, corpus, si);

        set_adjacency_matrix_fast(corpus, si, mdl, false);

        // Equals length * length; used as the denominator of the
        // support-vector ratio in the log lines below.
        max_sv += (sent->length + 1) * sent->length - sent->length;

        int *model = parse(sent);

        debug("Parsing sentence %d of length %d is done", si, sent->length);
        int *empirical = get_parents(sent);

        int nm = nmatch(model, empirical, sent->length);

        debug("Model matches %d arcs out of %d arcs", nm, sent->length);
        if (nm != sent->length) {
            log_info("Sentence %d (section %d) of length %d (%d arcs out of %d arcs are correct)", si, sent->section, sent->length, nm, sent->length);

            const int sentence_length = sent->length;
            for (int to = 1; to <= sentence_length; to++) {
                if (model[to] != empirical[to]) {
                    // -1 for the predicted arc, +1 for the reference arc.
                    update_alpha(mdl, si, model[to], to, sent, -1);
                    update_alpha(mdl, si, empirical[to], to, sent, +1);
                }
            }
        } else {
            log_info("Sentence %d (section %d) of length %d (Perfect parse)", si, sent->section, sent->length);
        }

        if (budget_method == RANDOMIZED && mdl->M > budget_target) {
            size_t nbefore = mdl->M;
            size_t nasked = nbefore - budget_target;
            size_t nsuccess = delete_n_random_hypothesis(mdl, nasked);

            log_info("%lu vectors deleted (%lu asked). Current hypothesis set size reduced from %lu to %lu", nsuccess, nasked, nbefore, mdl->M);
        }

        mdl->c++;

        free_feature_matrix(corpus, si);

        match += nm;
        total += (sent->length);

        if ((si + 1) % 1000 == 0) {
            log_info("Running training accuracy %lf after %d sentence.", (match * 1.) / total, si + 1);

            unsigned nsv = mdl->M;
            log_info("%u (%f of total %d) support vectors", nsv, (nsv * 1.) / max_sv, max_sv);
        }

        free(model);
        free(empirical);
    }

    unsigned nsv = mdl->M;
    log_info("Running training accuracy %lf", (match * 1.) / total);
    log_info("%u (%f of total %d) support vectors", nsv, (nsv * 1.) / max_sv, max_sv);

    if (verbosity > 0) {
        dump_support_vectors(mdl);
    }

    update_average_alpha(mdl);
}
Exemplo n.º 8
0
/**
 * Runs one training pass of the (linear) perceptron over the corpus.
 *
 * For every training sentence the adjacency matrix is built from
 * `mdl->embedding_w`, the sentence is parsed and the predicted parents are
 * compared with the parents returned by get_parents(). When at least one
 * arc differs:
 *   - if the corpus has discrete patterns, the discrete weights of the
 *     features collected for the predicted arcs are decreased and those of
 *     the reference arcs are increased (plain, averaged and temp vectors);
 *   - for every word whose predicted parent differs, the embedding weight
 *     vectors are updated with -1 for the predicted arc and +1 for the
 *     reference arc (the averaged vector is scaled by `mdl->c`).
 *
 * After the pass the temp weight vectors are set to
 * `w - w_avg / c`.
 *
 * @param mdl     perceptron model to update
 * @param corpus  training corpus
 * @param max_rec number of leading sentences to train on, or -1 for all of
 *                them; values larger than the corpus size are clamped to
 *                the corpus size
 */
void train_once_PerceptronModel(PerceptronModel mdl, const CoNLLCorpus corpus, int max_rec) {
    long match = 0, total = 0;

    // Never iterate past the sentences that are actually stored.
    const int available = DArray_count(corpus->sentences);
    int ninstance = (max_rec == -1) ? available : max_rec;
    if (ninstance > available) {
        ninstance = available;
    }

    log_info("Total number of training instances %d", ninstance);
    for (int si = 0; si < ninstance; si++) {
        FeaturedSentence sent = (FeaturedSentence) DArray_get(corpus->sentences, si);

        // parser_rate covers adjacency-matrix construction plus parsing.
        start(&parser_rate);

        debug("Building adjacency matrix for sentence %d of length %d", si, sent->length);
        build_adjacency_matrix(corpus, si, mdl->embedding_w, NULL);

        int *model = parse(sent);
        stop(&parser_rate);
        debug("Parsing sentence %d is done", si);
        int *empirical = get_parents(sent);

        int nm = nmatch(model, empirical, sent->length);
        debug("Model matches %d arcs out of %d arcs", nm, sent->length);
        if (nm != sent->length) {

            if (corpus->disrete_patterns_parts) {

                log_info("I have discrete features");

                DArray* model_features = DArray_create(sizeof (uint32_t), 16);
                DArray* empirical_features = DArray_create(sizeof (uint32_t), 16);

                // Words are indexed 1..length (inclusive), exactly as in
                // the embedding update loop below.
                for (int fi = 1; fi <= sent->length; fi++) {
                    fill_features(mdl->features->map, model_features, model[fi], fi, sent);

                    fill_features(mdl->features->map, empirical_features, empirical[fi], fi, sent);
                }

                for (int i = 0; i < DArray_count(model_features); i++) {
                    uint32_t *fidx = (uint32_t *) DArray_get(model_features, i);

                    mdl->discrete_w->data[*fidx] -= 1.0;
                    mdl->discrete_w_avg->data[*fidx] -= (mdl->c) * 1.0;
                    mdl->discrete_w_temp->data[*fidx] -= 1.0;
                }

                for (int i = 0; i < DArray_count(empirical_features); i++) {
                    uint32_t *fidx = (uint32_t *) DArray_get(empirical_features, i);

                    mdl->discrete_w->data[*fidx] += 1.0;
                    mdl->discrete_w_avg->data[*fidx] += (mdl->c) * 1.0;
                    mdl->discrete_w_temp->data[*fidx] += 1.0;
                }

                DArray_destroy(model_features);
                DArray_destroy(empirical_features);
            }

            for (int i = 1; i <= sent->length; i++) {

                if (model[i] != empirical[i]) {
                    // -1 for the predicted arc
                    embedding_feature(sent, model[i], i, xformed_v);
                    vadd(mdl->embedding_w, xformed_v, -1.0);
                    vadd(mdl->embedding_w_temp, xformed_v, -1.0);
                    vadd(mdl->embedding_w_avg, xformed_v, -(mdl->c));

                    // +1 for the gold arc
                    embedding_feature(sent, empirical[i], i, xformed_v);
                    vadd(mdl->embedding_w, xformed_v, 1.0);
                    vadd(mdl->embedding_w_temp, xformed_v, 1.0);
                    vadd(mdl->embedding_w_avg, xformed_v, (mdl->c));
                }
            }
        }

        free_feature_matrix(corpus, si);

        mdl->c++;

        match += nm;
        total += (sent->length);

        if (si % 1000 == 0 && si > 0) {
            log_info("Running training accuracy %lf", (match * 1.) / total);
        }

        free(model);
        free(empirical);
    }

    log_info("Running training accuracy %lf", (match * 1.) / total);

    // temp = w - w_avg / c, for the discrete weights ...
    if (corpus->disrete_patterns_parts) {
        for (int i = 0; i < mdl->n; i++) {
            mdl->discrete_w_temp->data[i] = mdl->discrete_w->data[i] - (mdl->discrete_w_avg->data[i]) / (mdl->c);
        }
    }

    // ... and for the embedding weights.
    memcpy(mdl->embedding_w_temp->data, mdl->embedding_w->data, mdl->embedding_w->n * sizeof (float));
    vadd(mdl->embedding_w_temp, mdl->embedding_w_avg, -1. / (mdl->c));
}
Exemplo n.º 9
0
Arquivo: lda.c Projeto: JeyKeu/Twitter
/**
 * Command-line entry point of the LDA trainer.
 *
 * Options (parsed with getopt): -N sets the number of classes, -I and -D
 * set the two iteration limits passed to lda_learn() (emmax, demmax), -E
 * sets epsilon, and -h calls usage(). Exactly two positional arguments are
 * expected: the training data file and the model name. The learned
 * parameters are written to "<model>.alpha" and "<model>.beta".
 *
 * Exits with status 1 when the data cannot be read, memory cannot be
 * allocated or the output files cannot be opened; exits with 0 otherwise.
 */
int
main (int argc, char *argv[])
{
	document *data;
	double *alpha;
	double **beta;
	FILE *ap, *bp;		// for alpha, beta
	int c;			// getopt() returns int; a plain char may be unsigned and never equal -1
	int nlex, dlenmax;
	int nclass     = CLASS_DEFAULT;		// default in lda.h
	int emmax      = EMMAX_DEFAULT;		// default in lda.h
	int demmax     = DEMMAX_DEFAULT;	// default in lda.h
	double epsilon = EPSILON_DEFAULT;	// default in lda.h

	while ((c = getopt(argc, argv, "N:I:D:E:h")) != -1)
	{
		switch (c) {
			case 'N': nclass  = atoi(optarg); break;
			case 'I': emmax   = atoi(optarg); break;
			case 'D': demmax  = atoi(optarg); break;
			case 'E': epsilon = atof(optarg); break;
			case 'h': usage (); break;
			default : usage (); break;
		}
	}
	/* exactly two positional arguments: training data and model name */
	if (!(argc - optind == 2))
		usage ();

	/* open data */
	if ((data = feature_matrix(argv[optind], &nlex, &dlenmax)) == NULL) {
		fprintf(stderr, "lda:: cannot open training data.\n");
		exit(1);
	}
	/* allocate parameters */
	if ((alpha = (double *)calloc(nclass, sizeof(double))) == NULL) {
		fprintf(stderr, "lda:: cannot allocate alpha.\n");
		exit(1);
	}
	if ((beta = dmatrix(nlex, nclass)) == NULL) {
		fprintf(stderr, "lda:: cannot allocate beta.\n");
		exit(1);
	}
	/* open model outputs */
	if (((ap = fopen(strconcat(argv[optind + 1], ".alpha"), "w")) == NULL)
	 || ((bp = fopen(strconcat(argv[optind + 1], ".beta"), "w"))  == NULL))
	{
		fprintf(stderr, "lda:: cannot open model outputs.\n");
		exit(1);
	}

	lda_learn (data, alpha, beta, nclass, nlex, dlenmax,
		   emmax, demmax, epsilon);
	lda_write (ap, bp, alpha, beta, nclass, nlex);

	free_feature_matrix(data);
	free_dmatrix(beta, nlex);
	free(alpha);
	
	fclose(ap);
	fclose(bp);

	exit(0);
}