Exemple #1
0
/* rdr_loadpat:
 *   Load and compile patterns from given file and store them in the reader. As
 *   we compile patterns, syntax errors in them will be raised at this time.
 *
 *   Each line is stripped of anything following a '#' and of trailing
 *   whitespace; lines left empty are skipped. The first character of the
 *   remaining text is lowercased and selects which counters are bumped: 'u'
 *   increments nuni, 'b' increments nbi and '*' increments both. Any other
 *   leading character is reported through fatal().
 */
void rdr_loadpat(rdr_t *rdr, FILE *file) {
	while (!feof(file)) {
		// Read raw input line
		char *line = rdr_readline(file);
		if (line == NULL)
			break;
		// Remove comments and trailing spaces. The <ctype.h> functions
		// require a value representable as unsigned char, so the byte
		// is cast first: a plain (possibly signed) char holding a
		// non-ASCII byte would otherwise be undefined behavior.
		size_t end = strcspn(line, "#");
		while (end != 0 && isspace((unsigned char)line[end - 1]))
			end--;
		if (end == 0) {
			free(line);
			continue;
		}
		line[end] = '\0';
		line[0] = (char)tolower((unsigned char)line[0]);
		// Compile pattern and add it to the list.
		// NOTE(review): line is not freed on this path; presumably
		// pat_comp keeps the pointer inside the pattern -- confirm.
		pat_t *pat = pat_comp(line);
		rdr->npats++;
		switch (line[0]) {
			case 'u': rdr->nuni++; break;
			case 'b': rdr->nbi++; break;
			case '*': rdr->nuni++;
			          rdr->nbi++; break;
			default:
				fatal("unknown pattern type '%c'", line[0]);
		}
		// Size the array from its own element type rather than from an
		// unrelated pointer type.
		rdr->pats = xrealloc(rdr->pats, sizeof(*rdr->pats) * rdr->npats);
		rdr->pats[rdr->npats - 1] = pat;
		rdr->ntoks = max(rdr->ntoks, pat->ntoks);
	}
}
Exemple #2
0
/* rdr_readraw:
 *   Read a raw sequence from given file: a set of lines terminated by end of
 *   file or by an empty line. Return NULL if file end was reached before any
 *   sequence was read.
 *
 *   Lines containing only whitespace count as empty. Leading empty lines are
 *   skipped. When rdr->autouni is set, the sequence is cut after its first
 *   line. The returned object and the lines stored in it are not released
 *   here.
 */
raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
	if (feof(file))
		return NULL;
	// Prepare the raw sequence object
	uint32_t size = 32, cnt = 0;
	raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char *) * size);
	// And read the next sequence in the file, this will skip any blank line
	// before reading the sequence stopping at end of file or on a new blank
	// line.
	while (!feof(file)) {
		char *line = rdr_readline(file);
		if (line == NULL)
			break;
		// Check for empty line marking the end of the current sequence.
		// isspace() needs a value representable as unsigned char, so
		// cast the byte to avoid undefined behavior on non-ASCII input
		// when plain char is signed.
		size_t len = strlen(line);
		while (len != 0 && isspace((unsigned char)line[len - 1]))
			len--;
		if (len == 0) {
			free(line);
			// Special case when no line was already read, we try
			// again. This allows multiple blank lines between
			// sequences.
			if (cnt == 0)
				continue;
			break;
		}
		// Next, grow the buffer if needed and add the new line in it
		if (size == cnt) {
			size *= 1.4;
			raw = xrealloc(raw, sizeof(raw_t)
			                + sizeof(char *) * size);
		}
		raw->lines[cnt++] = line;
		// In autouni mode, there will be only unigram features so we
		// can use small sequences to improve multi-threading.
		if (rdr->autouni)
			break;
	}
	// If no line was read, we just free allocated memory and return NULL
	// to signal the end of file to the caller. Else, we adjust the object
	// size and return it.
	if (cnt == 0) {
		free(raw);
		return NULL;
	}
	raw = xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
	raw->len = cnt;
	return raw;
}
Exemple #3
0
/*******************************************************************************
 * Updating
 ******************************************************************************/
/* doupdt:
 *   Load the model named by mdl->opt->model, overwrite individual weights
 *   according to an update file, optionally compact the model, and save it.
 *
 *   Updates are read from mdl->opt->input, or stdin when it is NULL; the
 *   result is written to mdl->opt->output, or stdout when it is NULL.
 *
 *   Every non-blank update line must hold four whitespace separated tokens:
 *   an observation, a previous label or "#", a label, and a weight. With "#"
 *   the weight is stored at uoff[obs] + y, otherwise at
 *   boff[obs] + yp * nlbl + y. Malformed lines, unknown observations or labels
 *   and unparsable weights are reported through fatal().
 */
static void doupdt(mdl_t *mdl) {
    // Load input model file
    info("* Load model\n");
    if (mdl->opt->model == NULL)
        fatal("no model file provided");
    FILE *Min = fopen(mdl->opt->model, "r");
    if (Min == NULL)
        pfatal("cannot open model file %s", mdl->opt->model);
    mdl_load(mdl, Min);
    fclose(Min);
    // Open patch file
    info("* Update model\n");
    FILE *fin = stdin;
    if (mdl->opt->input != NULL) {
        fin = fopen(mdl->opt->input, "r");
        if (fin == NULL)
            pfatal("cannot open update file");
    }
    int nline = 0;
    while (!feof(fin)) {
        char *raw = rdr_readline(fin);
        if (raw == NULL)
            break;
        char *line = raw;
        nline++;
        // First we split the line in space separated tokens. We expect
        // four of them and skip empty lines. Bytes are cast to unsigned
        // char before isspace() since passing a negative plain char is
        // undefined behavior.
        char *toks[4];
        int ntoks = 0;
        while (ntoks < 4) {
            while (isspace((unsigned char)*line))
                line++;
            if (*line == '\0')
                break;
            toks[ntoks++] = line;
            while (*line != '\0' && !isspace((unsigned char)*line))
                line++;
            if (*line == '\0')
                break;
            *line++ = '\0';
        }
        if (ntoks == 0) {
            free(raw);
            continue;
        } else if (ntoks != 4) {
            fatal("invalid line at %d", nline);
        }
        // Parse the tokens, the first three should be strings mapping to
        // observations and labels and the last should be the weight.
        uint64_t obs = none, yp = none, y = none;
        obs = qrk_str2id(mdl->reader->obs, toks[0]);
        if (obs == none)
            fatal("bad on observation on line %d", nline);
        // A "#" in second position means there is no previous label.
        if (strcmp(toks[1], "#")) {
            yp = qrk_str2id(mdl->reader->lbl, toks[1]);
            if (yp == none)
                fatal("bad label <%s> line %d", toks[1], nline);
        }
        y = qrk_str2id(mdl->reader->lbl, toks[2]);
        if (y == none)
            fatal("bad label <%s> line %d", toks[2], nline);
        double wgh = 0.0;
        if (sscanf(toks[3], "%lf", &wgh) != 1)
            fatal("bad weight on line %d", nline);

        // Store the weight in the slot selected by the tokens.
        // NOTE(review): nothing here checks that obs really owns a block
        // at uoff[obs] / boff[obs] -- confirm against the model layout.
        const uint32_t Y = mdl->nlbl;
        if (yp == none) {
            double *w = mdl->theta + mdl->uoff[obs];
            w[y] = wgh;
        } else {
            double *w = mdl->theta + mdl->boff[obs];
            w[yp * Y + y] = wgh;
        }
        free(raw);
    }
    if (mdl->opt->input != NULL)
        fclose(fin);
    // If requested compact the model.
    if (mdl->opt->compact) {
        const uint64_t O = mdl->nobs;
        const uint64_t F = mdl->nftr;
        info("* Compacting the model\n");
        mdl_compact(mdl);
        info("    %8"PRIu64" observations removed\n", O - mdl->nobs);
        info("    %8"PRIu64" features removed\n", F - mdl->nftr);
    }
    // And save the updated model
    info("* Save the model\n");
    FILE *file = stdout;
    if (mdl->opt->output != NULL) {
        file = fopen(mdl->opt->output, "w");
        if (file == NULL)
            pfatal("cannot open output model");
    }
    mdl_save(mdl, file);
    // fclose() flushes buffered data, so a failure here means the model
    // was not completely written; do not let that pass silently.
    if (mdl->opt->output != NULL) {
        if (fclose(file) != 0)
            pfatal("cannot write output model");
    }
    info("* Done\n");
}