Esempio n. 1
0
/* dotrain:
 *   Entry point of the training mode. Using the options in mdl->opt it:
 *     - prints the registered trainers and exits if the algorithm is "list",
 *       otherwise looks the algorithm name up in trn_lst;
 *     - optionally loads a previous model and a pattern file;
 *     - reads the training data (from opt->input, or stdin when NULL) and
 *       the optional development data;
 *     - syncs the model, runs the selected trainer, optionally compacts the
 *       model, and saves it (to opt->output, or stdout when NULL).
 *   The code relies on fatal() and pfatal() not returning to the caller.
 */
void dotrain(mdl_t *mdl) {
	// Check if the user requested the trainer list. If this is not the
	// case, search the trainer.
	if (!strcmp(mdl->opt->algo, "list")) {
		info("Available training algorithms:\n");
		for (int i = 0; i < trn_cnt; i++)
			info("\t%s\n", trn_lst[i].name);
		exit(EXIT_SUCCESS);
	}
	int trn;
	for (trn = 0; trn < trn_cnt; trn++)
		if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
			break;
	if (trn == trn_cnt)
		fatal("unknown algorithm '%s'", mdl->opt->algo);
	// Load a previous model to train again if specified by the user.
	if (mdl->opt->model != NULL) {
		info("* Load previous model\n");
		FILE *file = fopen(mdl->opt->model, "r");
		if (file == NULL)
			pfatal("cannot open input model file");
		mdl_load(mdl, file);
		// The stream is only needed while loading; close it so the
		// handle is not leaked for the whole training run.
		fclose(file);
	}
	// Load the pattern file. This will unlock the database if previously
	// locked by loading a model.
	if (mdl->opt->pattern != NULL) {
		info("* Load patterns\n");
		FILE *file = fopen(mdl->opt->pattern, "r");
		if (file == NULL)
			pfatal("cannot open pattern file");
		rdr_loadpat(mdl->reader, file);
		fclose(file);
		qrk_lock(mdl->reader->obs, false);
	}
	// Load the training data. When this is done we lock the quarks as we
	// don't want to put in the model information present only in the
	// development set.
	info("* Load training data\n");
	FILE *file = stdin;
	if (mdl->opt->input != NULL) {
		file = fopen(mdl->opt->input, "r");
		if (file == NULL)
			pfatal("cannot open input data file");
	}
	mdl->train = rdr_readdat(mdl->reader, file, true);
	// Only close what we opened ourselves; stdin is left alone.
	if (mdl->opt->input != NULL)
		fclose(file);
	qrk_lock(mdl->reader->lbl, true);
	qrk_lock(mdl->reader->obs, true);
	if (mdl->train == NULL || mdl->train->nseq == 0)
		fatal("no train data loaded");
	// If present, load the development set in the model. If not specified,
	// the training dataset will be used instead.
	if (mdl->opt->devel != NULL) {
		info("* Load development data\n");
		FILE *file = fopen(mdl->opt->devel, "r");
		if (file == NULL)
			pfatal("cannot open development file");
		mdl->devel = rdr_readdat(mdl->reader, file, true);
		fclose(file);
	}
	// Initialize the model. If a previous model was loaded, this will be
	// just a resync, else the model structure will be created.
	if (mdl->theta == NULL)
		info("* Initialize the model\n");
	else
		info("* Resync the model\n");
	mdl_sync(mdl);
	// Display some statistics as we all love this.
	info("* Summary\n");
	info("    nb train:    %d\n", mdl->train->nseq);
	if (mdl->devel != NULL)
		info("    nb devel:    %d\n", mdl->devel->nseq);
	info("    nb labels:   %zu\n", mdl->nlbl);
	info("    nb blocks:   %zu\n", mdl->nobs);
	info("    nb features: %zu\n", mdl->nftr);
	// And train the model...
	info("* Train the model with %s\n", mdl->opt->algo);
	uit_setup(mdl);
	trn_lst[trn].train(mdl);
	uit_cleanup(mdl);
	// If requested compact the model. The counts are saved beforehand so
	// the number of removed observations and features can be reported.
	if (mdl->opt->compact) {
		const size_t O = mdl->nobs;
		const size_t F = mdl->nftr;
		info("* Compacting the model\n");
		mdl_compact(mdl);
		info("    %8zu observations removed\n", O - mdl->nobs);
		info("    %8zu features removed\n", F - mdl->nftr);
	}
	// And save the trained model
	info("* Save the model\n");
	file = stdout;
	if (mdl->opt->output != NULL) {
		file = fopen(mdl->opt->output, "w");
		if (file == NULL)
			pfatal("cannot open output model");
	}
	mdl_save(mdl, file);
	// Only close what we opened ourselves; stdout is left alone.
	if (mdl->opt->output != NULL)
		fclose(file);
	info("* Done\n");
}
Esempio n. 2
0
/*******************************************************************************
 * Updating
 ******************************************************************************/
/* doupdt:
 *   Entry point of the update mode. Loads the model named by opt->model,
 *   then reads a patch from opt->input (or stdin when NULL) and overwrites
 *   individual weights in mdl->theta. Each non-empty patch line must hold
 *   four whitespace-separated tokens:
 *       <observation> <previous-label | #> <label> <weight>
 *   When the second token is "#", the weight is stored at index <label> of
 *   the block at mdl->uoff[obs]; otherwise it is stored at index
 *   <previous-label> * nlbl + <label> of the block at mdl->boff[obs].
 *   Anything after the fourth token on a line is ignored. The model is then
 *   optionally compacted and saved to opt->output (or stdout when NULL).
 *   The code relies on fatal() and pfatal() not returning to the caller.
 */
static void doupdt(mdl_t *mdl) {
    // Load input model file
    info("* Load model\n");
    if (mdl->opt->model == NULL)
        fatal("no model file provided");
    FILE *Min = fopen(mdl->opt->model, "r");
    if (Min == NULL)
        pfatal("cannot open model file %s", mdl->opt->model);
    mdl_load(mdl, Min);
    fclose(Min);
    // Open patch file
    info("* Update model\n");
    FILE *fin = stdin;
    if (mdl->opt->input != NULL) {
        fin = fopen(mdl->opt->input, "r");
        if (fin == NULL)
            pfatal("cannot open update file");
    }
    int nline = 0;
    while (!feof(fin)) {
        char *raw = rdr_readline(fin);
        if (raw == NULL)
            break;
        // Keep 'raw' for free(); 'line' is the cursor used to tokenize.
        char *line = raw;
        nline++;
        // First we split the line in space separated tokens. We expect
        // four of them and skip empty lines. The tokens are cut in place
        // by overwriting the separator that follows each one with '\0'.
        // The <ctype.h> functions require a value representable as
        // unsigned char (or EOF), so the cast is needed to stay defined
        // when the line contains bytes >= 0x80 and char is signed.
        char *toks[4];
        int ntoks = 0;
        while (ntoks < 4) {
            while (isspace((unsigned char)*line))
                line++;
            if (*line == '\0')
                break;
            toks[ntoks++] = line;
            while (*line != '\0' && !isspace((unsigned char)*line))
                line++;
            if (*line == '\0')
                break;
            *line++ = '\0';
        }
        if (ntoks == 0) {
            free(raw);
            continue;
        } else if (ntoks != 4) {
            fatal("invalid line at %d", nline);
        }
        // Parse the tokens, the first three should be strings mapping to
        // observations and labels and the last should be the weight.
        // A previous label of "#" leaves yp at 'none'.
        uint64_t obs = none, yp = none, y = none;
        obs = qrk_str2id(mdl->reader->obs, toks[0]);
        if (obs == none)
            fatal("bad on observation on line %d", nline);
        if (strcmp(toks[1], "#")) {
            yp = qrk_str2id(mdl->reader->lbl, toks[1]);
            if (yp == none)
                fatal("bad label <%s> line %d", toks[1], nline);
        }
        y = qrk_str2id(mdl->reader->lbl, toks[2]);
        if (y == none)
            fatal("bad label <%s> line %d", toks[2], nline);
        double wgh = 0.0;
        if (sscanf(toks[3], "%lf", &wgh) != 1)
            fatal("bad weight on line %d", nline);

        // Store the weight: a single-label entry when no previous label
        // was given, a (previous label, label) entry otherwise.
        const uint32_t Y = mdl->nlbl;
        if (yp == none) {
            double *w = mdl->theta + mdl->uoff[obs];
            w[y] = wgh;
        } else {
            double *w = mdl->theta + mdl->boff[obs];
            w[yp * Y + y] = wgh;
        }
        free(raw);
    }
    // Only close what we opened ourselves; stdin is left alone.
    if (mdl->opt->input != NULL)
        fclose(fin);
    // If requested compact the model. The counts are saved beforehand so
    // the number of removed observations and features can be reported.
    if (mdl->opt->compact) {
        const uint64_t O = mdl->nobs;
        const uint64_t F = mdl->nftr;
        info("* Compacting the model\n");
        mdl_compact(mdl);
        info("    %8"PRIu64" observations removed\n", O - mdl->nobs);
        info("    %8"PRIu64" features removed\n", F - mdl->nftr);
    }
    // And save the updated model
    info("* Save the model\n");
    FILE *file = stdout;
    if (mdl->opt->output != NULL) {
        file = fopen(mdl->opt->output, "w");
        if (file == NULL)
            pfatal("cannot open output model");
    }
    mdl_save(mdl, file);
    // Only close what we opened ourselves; stdout is left alone.
    if (mdl->opt->output != NULL)
        fclose(file);
    info("* Done\n");
}