/* rdr_loadpat:
 *   Load and compile patterns from the given file and store them in the
 *   reader. Patterns are compiled as they are read, so syntax errors in them
 *   are raised at this time.
 *
 *   Each non-empty line (after removing any '#' comment and trailing
 *   whitespace) is one pattern. Its first character, lower-cased, selects the
 *   pattern kind: 'u' counts as unigram, 'b' as bigram, and '*' as both. Any
 *   other first character is a fatal error.
 *
 *   The line buffer is handed to pat_comp() and is not freed here.
 *   NOTE(review): presumably pat_comp() keeps a reference to it -- confirm.
 */
void rdr_loadpat(rdr_t *rdr, FILE *file) {
	while (!feof(file)) {
		// Read raw input line
		char *line = rdr_readline(file);
		if (line == NULL)
			break;
		// Remove comments and trailing spaces. The <ctype.h> functions
		// require a value representable as unsigned char (or EOF), so
		// cast before the call to stay defined for non-ASCII bytes.
		size_t end = strcspn(line, "#");
		while (end != 0 && isspace((unsigned char)line[end - 1]))
			end--;
		if (end == 0) {
			free(line);
			continue;
		}
		line[end] = '\0';
		line[0] = (char)tolower((unsigned char)line[0]);
		// Compile pattern and add it to the list
		pat_t *pat = pat_comp(line);
		rdr->npats++;
		switch (line[0]) {
			case 'u':
				rdr->nuni++;
				break;
			case 'b':
				rdr->nbi++;
				break;
			case '*':
				rdr->nuni++;
				rdr->nbi++;
				break;
			default:
				fatal("unknown pattern type '%c'", line[0]);
		}
		// Size the array from its own element type rather than from an
		// unrelated pointer type.
		rdr->pats = xrealloc(rdr->pats,
		                     sizeof(*rdr->pats) * rdr->npats);
		rdr->pats[rdr->npats - 1] = pat;
		rdr->ntoks = max(rdr->ntoks, pat->ntoks);
	}
}
/* rdr_readraw:
 *   Read a raw sequence from the given file: a set of lines terminated by end
 *   of file or by an empty (whitespace-only) line. Blank lines preceding the
 *   sequence are skipped. Return NULL if the end of file was reached before
 *   any sequence line was read.
 *
 *   The returned object and the line strings it holds are heap allocated;
 *   the caller takes ownership of them.
 */
raw_t *rdr_readraw(rdr_t *rdr, FILE *file) {
	if (feof(file))
		return NULL;
	// Prepare the raw sequence object
	uint32_t size = 32, cnt = 0;
	raw_t *raw = xmalloc(sizeof(raw_t) + sizeof(char *) * size);
	// And read the next sequence in the file. This skips any blank line
	// before the sequence and stops at end of file or on a new blank
	// line.
	while (!feof(file)) {
		char *line = rdr_readline(file);
		if (line == NULL)
			break;
		// Check for an empty line marking the end of the current
		// sequence. The length stays in size_t to avoid truncation, and
		// isspace() gets an unsigned char value as <ctype.h> requires.
		size_t len = strlen(line);
		while (len != 0 && isspace((unsigned char)line[len - 1]))
			len--;
		if (len == 0) {
			free(line);
			// Special case when no line was read yet: try again.
			// This allows multiple blank lines between sequences.
			if (cnt == 0)
				continue;
			break;
		}
		// Next, grow the buffer if needed and add the new line to it
		if (size == cnt) {
			size *= 1.4;
			raw = xrealloc(raw, sizeof(raw_t)
			                  + sizeof(char *) * size);
		}
		raw->lines[cnt++] = line;
		// In autouni mode a sequence is cut after a single line.
		// (Original rationale: only unigram features exist then, so
		// small sequences improve multi-threading.)
		if (rdr->autouni)
			break;
	}
	// If no line was read, free the allocated memory and return NULL to
	// signal the end of file to the caller. Otherwise shrink the object to
	// its final size and return it.
	if (cnt == 0) {
		free(raw);
		return NULL;
	}
	raw = xrealloc(raw, sizeof(raw_t) + sizeof(char *) * cnt);
	raw->len = cnt;
	return raw;
}
/*******************************************************************************
 * Updating
 ******************************************************************************/

/* doupdt:
 *   Load a model, apply a patch file of weight updates to it, optionally
 *   compact it, and save the result.
 *
 *   The patch is read from mdl->opt->input (stdin when NULL). Each non-empty
 *   line must hold exactly four whitespace-separated tokens:
 *       <observation> <previous-label | #> <label> <weight>
 *   When the second token is "#", the weight is stored in the unigram block
 *   of the observation (uoff); otherwise it is stored in the bigram block
 *   (boff) at index yp * nlbl + y. Any malformed line or unknown name is a
 *   fatal error.
 *
 *   The updated model is written to mdl->opt->output (stdout when NULL).
 */
static void doupdt(mdl_t *mdl) {
	// Load input model file
	info("* Load model\n");
	if (mdl->opt->model == NULL)
		fatal("no model file provided");
	FILE *Min = fopen(mdl->opt->model, "r");
	if (Min == NULL)
		pfatal("cannot open model file %s", mdl->opt->model);
	mdl_load(mdl, Min);
	fclose(Min);
	// Open patch file
	info("* Update model\n");
	FILE *fin = stdin;
	if (mdl->opt->input != NULL) {
		fin = fopen(mdl->opt->input, "r");
		if (fin == NULL)
			pfatal("cannot open update file");
	}
	int nline = 0;
	while (!feof(fin)) {
		char *raw = rdr_readline(fin);
		if (raw == NULL)
			break;
		char *line = raw;
		nline++;
		// First split the line into space separated tokens, in place.
		// We expect four of them and skip empty lines. The casts keep
		// isspace() defined for bytes outside the ASCII range.
		char *toks[4];
		int ntoks = 0;
		while (ntoks < 4) {
			while (isspace((unsigned char)*line))
				line++;
			if (*line == '\0')
				break;
			toks[ntoks++] = line;
			while (*line != '\0' && !isspace((unsigned char)*line))
				line++;
			if (*line == '\0')
				break;
			*line++ = '\0';
		}
		if (ntoks == 0) {
			free(raw);
			continue;
		} else if (ntoks != 4) {
			fatal("invalid line at %d", nline);
		}
		// Parse the tokens: the first three are strings mapping to an
		// observation and labels, the last one is the weight.
		// NOTE(review): this relies on qrk_str2id() returning `none`
		// for unknown strings -- confirm the quarks are locked here.
		uint64_t obs = none, yp = none, y = none;
		obs = qrk_str2id(mdl->reader->obs, toks[0]);
		if (obs == none)
			fatal("bad observation on line %d", nline);
		if (strcmp(toks[1], "#")) {
			yp = qrk_str2id(mdl->reader->lbl, toks[1]);
			if (yp == none)
				fatal("bad label <%s> line %d", toks[1], nline);
		}
		y = qrk_str2id(mdl->reader->lbl, toks[2]);
		if (y == none)
			fatal("bad label <%s> line %d", toks[2], nline);
		double wgh = 0.0;
		if (sscanf(toks[3], "%lf", &wgh) != 1)
			fatal("bad weight on line %d", nline);
		// Store the weight: "#" as previous label selects the unigram
		// block, anything else the bigram block.
		const uint32_t Y = mdl->nlbl;
		if (yp == none) {
			double *w = mdl->theta + mdl->uoff[obs];
			w[y] = wgh;
		} else {
			double *w = mdl->theta + mdl->boff[obs];
			w[yp * Y + y] = wgh;
		}
		free(raw);
	}
	if (mdl->opt->input != NULL)
		fclose(fin);
	// If requested compact the model.
	if (mdl->opt->compact) {
		const uint64_t O = mdl->nobs;
		const uint64_t F = mdl->nftr;
		info("* Compacting the model\n");
		mdl_compact(mdl);
		info(" %8"PRIu64" observations removed\n", O - mdl->nobs);
		info(" %8"PRIu64" features removed\n", F - mdl->nftr);
	}
	// And save the updated model
	info("* Save the model\n");
	FILE *file = stdout;
	if (mdl->opt->output != NULL) {
		file = fopen(mdl->opt->output, "w");
		if (file == NULL)
			pfatal("cannot open output model");
	}
	mdl_save(mdl, file);
	// fclose() flushes buffered data; a failure here means the model on
	// disk is incomplete, so it must not be silently ignored.
	if (mdl->opt->output != NULL && fclose(file) != 0)
		pfatal("cannot close output model");
	info("* Done\n");
}