/*******************************************************************************
 * Labeling
 ******************************************************************************/

/* dolabel:
 *   Load the model named by mdl->opt->model and label sequences with it by
 *   handing the input and output streams to tag_label(). The input stream is
 *   mdl->opt->input (stdin when NULL) and the output stream is
 *   mdl->opt->output (stdout when NULL). A missing model option is reported
 *   through fatal() and any file that cannot be opened through pfatal().
 */
void dolabel(mdl_t *mdl) {
	// First, load the model provided by the user. This is mandatory to
	// label new data.
	if (mdl->opt->model == NULL)
		fatal("you must specify a model");
	info("* Load model\n");
	FILE *file = fopen(mdl->opt->model, "r");
	if (file == NULL)
		pfatal("cannot open input model file");
	mdl_load(mdl, file);
	// The model stream is not used past this point; close it right after
	// loading, as doupdt and dodump do, instead of leaking the handle.
	fclose(file);
	// Open input and output files, falling back on the standard streams
	// when the user gave no file name.
	FILE *fin = stdin, *fout = stdout;
	if (mdl->opt->input != NULL) {
		fin = fopen(mdl->opt->input, "r");
		if (fin == NULL)
			pfatal("cannot open input data file");
	}
	if (mdl->opt->output != NULL) {
		fout = fopen(mdl->opt->output, "w");
		if (fout == NULL)
			pfatal("cannot open output data file");
	}
	// Do the labelling
	info("* Label sequences\n");
	tag_label(mdl, fin, fout);
	info("* Done\n");
	// And close files. Only streams opened here are closed, the standard
	// ones are left untouched.
	if (mdl->opt->input != NULL)
		fclose(fin);
	if (mdl->opt->output != NULL)
		fclose(fout);
}
/*******************************************************************************
 * Dumping
 ******************************************************************************/

/* dodump:
 *   Load a model from mdl->opt->input (stdin when NULL) and write its weights
 *   as text to mdl->opt->output (stdout when NULL). Each weight is printed on
 *   its own line as three tab separated fields followed by the value:
 *       <observation> TAB # TAB <label> TAB <weight>           (kind bit 1)
 *       <observation> TAB <prev label> TAB <label> TAB <weight> (kind bit 2)
 *   Weights equal to zero are skipped unless mdl->opt->all is set, and an
 *   empty line follows every observation that produced at least one line.
 *   The number of decimals comes from mdl->opt->prec.
 */
static void dodump(mdl_t *mdl) {
	// Read the model, either from the named file or from stdin.
	info("* Load model\n");
	FILE *src;
	if (mdl->opt->input == NULL) {
		src = stdin;
	} else {
		src = fopen(mdl->opt->input, "r");
		if (src == NULL)
			pfatal("cannot open input data file");
	}
	mdl_load(mdl, src);
	if (mdl->opt->input != NULL)
		fclose(src);
	// Select the destination, either the named file or stdout.
	FILE *dst;
	if (mdl->opt->output == NULL) {
		dst = stdout;
	} else {
		dst = fopen(mdl->opt->output, "w");
		if (dst == NULL)
			pfatal("cannot open output data file");
	}
	// Dump model
	info("* Dump model\n");
	const uint32_t nlbl = mdl->nlbl;
	const uint64_t nobs = mdl->nobs;
	const qrk_t *labels = mdl->reader->lbl;
	const qrk_t *observations = mdl->reader->obs;
	// Build the "%.<prec>f\n" format used to print every weight.
	char wfmt[16];
	sprintf(wfmt, "%%.%df\n", mdl->opt->prec);
	uint64_t o = 0;
	while (o < nobs) {
		const char *name = qrk_id2str(observations, o);
		bool printed = false;
		// Weights indexed by the current label only.
		if (mdl->kind[o] & 1) {
			const double *uni = mdl->theta + mdl->uoff[o];
			for (uint32_t y = 0; y < nlbl; y++) {
				if (mdl->opt->all || uni[y] != 0.0) {
					const char *cur = qrk_id2str(labels, y);
					fprintf(dst, "%s\t#\t%s\t", name, cur);
					fprintf(dst, wfmt, uni[y]);
					printed = true;
				}
			}
		}
		// Weights indexed by a label pair, the flat index d encoding the
		// previous label as d / nlbl and the current one as d % nlbl.
		if (mdl->kind[o] & 2) {
			const double *bi = mdl->theta + mdl->boff[o];
			for (uint32_t d = 0; d < nlbl * nlbl; d++) {
				if (mdl->opt->all || bi[d] != 0.0) {
					const char *cur = qrk_id2str(labels, d % nlbl);
					const char *prev = qrk_id2str(labels, d / nlbl);
					fprintf(dst, "%s\t%s\t%s\t", name, prev, cur);
					fprintf(dst, wfmt, bi[d]);
					printed = true;
				}
			}
		}
		// Separate observations with a blank line, but only those that
		// produced some output.
		if (printed)
			fprintf(dst, "\n");
		o++;
	}
	if (mdl->opt->output != NULL)
		fclose(dst);
}
/* dotrain:
 *   Train a model and save it. The steps are, in order:
 *     - if mdl->opt->algo is "list", print the available trainers and exit,
 *       else look the algorithm up in trn_lst and fail if it is unknown;
 *     - optionally load a previous model (mdl->opt->model) and a pattern
 *       file (mdl->opt->pattern);
 *     - read the training data from mdl->opt->input (stdin when NULL) and
 *       lock both quarks, then optionally read the development data;
 *     - sync the model, run the selected trainer, optionally compact;
 *     - save the model to mdl->opt->output (stdout when NULL).
 *   Errors are reported through fatal() and pfatal().
 */
void dotrain(mdl_t *mdl) {
	// Check if the user requested the trainer list. If this is not the
	// case, search the trainer.
	if (!strcmp(mdl->opt->algo, "list")) {
		info("Available training algorithms:\n");
		for (int i = 0; i < trn_cnt; i++)
			info("\t%s\n", trn_lst[i].name);
		exit(EXIT_SUCCESS);
	}
	int trn;
	for (trn = 0; trn < trn_cnt; trn++)
		if (!strcmp(mdl->opt->algo, trn_lst[trn].name))
			break;
	if (trn == trn_cnt)
		fatal("unknown algorithm '%s'", mdl->opt->algo);
	// Load a previous model to train again if specified by the user.
	if (mdl->opt->model != NULL) {
		info("* Load previous model\n");
		FILE *mfile = fopen(mdl->opt->model, "r");
		if (mfile == NULL)
			pfatal("cannot open input model file");
		mdl_load(mdl, mfile);
		// Close the model stream once loaded, as is done for every
		// other file opened here, instead of leaking the handle.
		fclose(mfile);
	}
	// Load the pattern file. This will unlock the database if previously
	// locked by loading a model.
	if (mdl->opt->pattern != NULL) {
		info("* Load patterns\n");
		FILE *pfile = fopen(mdl->opt->pattern, "r");
		if (pfile == NULL)
			pfatal("cannot open pattern file");
		rdr_loadpat(mdl->reader, pfile);
		fclose(pfile);
		qrk_lock(mdl->reader->obs, false);
	}
	// Load the training data. When this is done we lock the quarks as we
	// don't want to put in the model information present only in the
	// development set.
	info("* Load training data\n");
	FILE *file = stdin;
	if (mdl->opt->input != NULL) {
		file = fopen(mdl->opt->input, "r");
		if (file == NULL)
			pfatal("cannot open input data file");
	}
	mdl->train = rdr_readdat(mdl->reader, file, true);
	if (mdl->opt->input != NULL)
		fclose(file);
	qrk_lock(mdl->reader->lbl, true);
	qrk_lock(mdl->reader->obs, true);
	if (mdl->train == NULL || mdl->train->nseq == 0)
		fatal("no train data loaded");
	// If present, load the development set in the model. If not specified,
	// the training dataset will be used instead.
	if (mdl->opt->devel != NULL) {
		info("* Load development data\n");
		FILE *dfile = fopen(mdl->opt->devel, "r");
		if (dfile == NULL)
			pfatal("cannot open development file");
		mdl->devel = rdr_readdat(mdl->reader, dfile, true);
		fclose(dfile);
	}
	// Initialize the model. If a previous model was loaded, this will be
	// just a resync, else the model structure will be created.
	if (mdl->theta == NULL)
		info("* Initialize the model\n");
	else
		info("* Resync the model\n");
	mdl_sync(mdl);
	// Display some statistics as we all love this. Each count is cast to
	// the type its conversion expects: elsewhere in this file the label
	// and observation counts are read as uint32_t and uint64_t, which the
	// %zu conversion does not match on every platform.
	info("* Summary\n");
	info(" nb train: %d\n", (int)mdl->train->nseq);
	if (mdl->devel != NULL)
		info(" nb devel: %d\n", (int)mdl->devel->nseq);
	info(" nb labels: %zu\n", (size_t)mdl->nlbl);
	info(" nb blocks: %zu\n", (size_t)mdl->nobs);
	info(" nb features: %zu\n", (size_t)mdl->nftr);
	// And train the model...
	info("* Train the model with %s\n", mdl->opt->algo);
	uit_setup(mdl);
	trn_lst[trn].train(mdl);
	uit_cleanup(mdl);
	// If requested compact the model, and report how much was dropped by
	// comparing the counts before and after.
	if (mdl->opt->compact) {
		const size_t O = mdl->nobs;
		const size_t F = mdl->nftr;
		info("* Compacting the model\n");
		mdl_compact(mdl);
		const size_t obs_removed = O - mdl->nobs;
		const size_t ftr_removed = F - mdl->nftr;
		info(" %8zu observations removed\n", obs_removed);
		info(" %8zu features removed\n", ftr_removed);
	}
	// And save the trained model
	info("* Save the model\n");
	file = stdout;
	if (mdl->opt->output != NULL) {
		file = fopen(mdl->opt->output, "w");
		if (file == NULL)
			pfatal("cannot open output model");
	}
	mdl_save(mdl, file);
	if (mdl->opt->output != NULL)
		fclose(file);
	info("* Done\n");
}
/*******************************************************************************
 * Updating
 ******************************************************************************/

/* doupdt:
 *   Load the model named by mdl->opt->model, overwrite some of its weights
 *   from a patch read on mdl->opt->input (stdin when NULL), optionally
 *   compact it, and save it to mdl->opt->output (stdout when NULL).
 *
 *   Each non-empty patch line holds four whitespace separated tokens:
 *       <observation> <previous label | #> <label> <weight>
 *   A "#" as second token selects the weight indexed by the label alone,
 *   anything else selects the weight indexed by the label pair. Unknown
 *   observations or labels, unparsable weights, lines with the wrong number
 *   of tokens, and observations lacking the requested kind of weight are
 *   all reported through fatal().
 */
static void doupdt(mdl_t *mdl) {
	// Load input model file
	info("* Load model\n");
	if (mdl->opt->model == NULL)
		fatal("no model file provided");
	FILE *Min = fopen(mdl->opt->model, "r");
	if (Min == NULL)
		pfatal("cannot open model file %s", mdl->opt->model);
	mdl_load(mdl, Min);
	fclose(Min);
	// Open patch file
	info("* Update model\n");
	FILE *fin = stdin;
	if (mdl->opt->input != NULL) {
		fin = fopen(mdl->opt->input, "r");
		if (fin == NULL)
			pfatal("cannot open update file");
	}
	int nline = 0;
	while (!feof(fin)) {
		char *raw = rdr_readline(fin);
		if (raw == NULL)
			break;
		char *line = raw;
		nline++;
		// First we split the line in space separated tokens. We expect
		// four of them and skip empty lines. The tokens are cut in place
		// so they all point inside raw. The characters are converted to
		// unsigned char before being given to isspace as its behavior is
		// undefined for negative values.
		char *toks[4];
		int ntoks = 0;
		while (ntoks < 4) {
			while (isspace((unsigned char)*line))
				line++;
			if (*line == '\0')
				break;
			toks[ntoks++] = line;
			while (*line != '\0' && !isspace((unsigned char)*line))
				line++;
			if (*line == '\0')
				break;
			*line++ = '\0';
		}
		if (ntoks == 0) {
			free(raw);
			continue;
		} else if (ntoks != 4) {
			fatal("invalid line at %d", nline);
		}
		// Parse the tokens, the first three should be strings mapping to
		// observations and labels and the last should be the weight.
		uint64_t obs = none, yp = none, y = none;
		obs = qrk_str2id(mdl->reader->obs, toks[0]);
		if (obs == none)
			fatal("bad on observation on line %d", nline);
		if (strcmp(toks[1], "#")) {
			yp = qrk_str2id(mdl->reader->lbl, toks[1]);
			if (yp == none)
				fatal("bad label <%s> line %d", toks[1], nline);
		}
		y = qrk_str2id(mdl->reader->lbl, toks[2]);
		if (y == none)
			fatal("bad label <%s> line %d", toks[2], nline);
		double wgh = 0.0;
		if (sscanf(toks[3], "%lf", &wgh) != 1)
			fatal("bad weight on line %d", nline);
		// Store the weight. The offsets of an observation are only used
		// by dodump when the matching bit of its kind is set, so refuse
		// to write through an offset of the other kind rather than
		// overwrite an unrelated part of the weight vector.
		const uint32_t Y = mdl->nlbl;
		if (yp == none) {
			if (!(mdl->kind[obs] & 1))
				fatal("observation <%s> has no unigram weight line %d",
					toks[0], nline);
			double *w = mdl->theta + mdl->uoff[obs];
			w[y] = wgh;
		} else {
			if (!(mdl->kind[obs] & 2))
				fatal("observation <%s> has no bigram weight line %d",
					toks[0], nline);
			double *w = mdl->theta + mdl->boff[obs];
			w[yp * Y + y] = wgh;
		}
		free(raw);
	}
	if (mdl->opt->input != NULL)
		fclose(fin);
	// If requested compact the model.
	if (mdl->opt->compact) {
		const uint64_t O = mdl->nobs;
		const uint64_t F = mdl->nftr;
		info("* Compacting the model\n");
		mdl_compact(mdl);
		info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
		info(" %8"PRIu64" features removed\n", F - mdl->nftr);
	}
	// And save the updated model
	info("* Save the model\n");
	FILE *file = stdout;
	if (mdl->opt->output != NULL) {
		file = fopen(mdl->opt->output, "w");
		if (file == NULL)
			pfatal("cannot open output model");
	}
	mdl_save(mdl, file);
	if (mdl->opt->output != NULL)
		fclose(file);
	info("* Done\n");
}