/* qrk_new: * This initialize the object for holding a new empty trie, with some pre- * allocations. The returned object must be freed with a call to qrk_free when * not needed anymore. */ qrk_t *qrk_new(void) { const uint64_t size = 128; qrk_t *qrk = wapiti_xmalloc(sizeof(qrk_t)); qrk->root = NULL; qrk->count = 0; qrk->lock = false; qrk->size = size; qrk->leafs = wapiti_xmalloc(sizeof(leaf_t) * size); return qrk; }
/* tag_eval:
 *   Compute the token error rate and sequence error rate over the devel set
 *   (or training set if not available). Both rates are expressed as
 *   percentages and stored through 'te' and 'se'. If no token, respectively
 *   no sequence, was evaluated the corresponding rate is reported as 0.0
 *   instead of the NaN a 0/0 division would produce.
 */
void tag_eval(mdl_t *mdl, double *te, double *se) {
	const uint32_t W = mdl->opt->nthread;
	dat_t *dat = (mdl->devel == NULL) ? mdl->train : mdl->devel;
	// First we prepare the eval state for all the workers threads, we just
	// have to give them the model and dataset to use. This state will be
	// used to retrieve partial result they computed.
	eval_t *eval[W];
	for (uint32_t w = 0; w < W; w++) {
		eval[w] = wapiti_xmalloc(sizeof(eval_t));
		eval[w]->mdl = mdl;
		eval[w]->dat = dat;
	}
	// And next, we call the workers to do the job and reduce the partial
	// result by summing them and computing the final error rates.
	mth_spawn((func_t *)tag_evalsub, W, (void *)eval, dat->nseq,
		mdl->opt->jobsize);
	uint64_t tcnt = 0, terr = 0;
	uint64_t scnt = 0, serr = 0;
	for (uint32_t w = 0; w < W; w++) {
		tcnt += eval[w]->tcnt;
		terr += eval[w]->terr;
		scnt += eval[w]->scnt;
		serr += eval[w]->serr;
		free(eval[w]);
	}
	// An empty dataset has no errors; avoid dividing zero by zero.
	*te = (tcnt == 0) ? 0.0 : (double)terr / tcnt * 100.0;
	*se = (scnt == 0) ? 0.0 : (double)serr / scnt * 100.0;
}
/* tag_evalsub:
 *   Worker side of the evaluation. The counters of the given eval state are
 *   reset, then batches of sequences are fetched with mth_getjob until none
 *   remain. Each sequence of a batch is decoded with tag_viterbi and the
 *   output compared with the reference labels to count token and sequence
 *   errors. The 'id' and 'cnt' arguments are not used.
 */
static void tag_evalsub(job_t *job, uint32_t id, uint32_t cnt, eval_t *eval) {
	unused(id && cnt);
	mdl_t *mdl = eval->mdl;
	dat_t *dat = eval->dat;
	eval->tcnt = 0;
	eval->terr = 0;
	eval->scnt = 0;
	eval->serr = 0;
	// Fetch the batches one after the other and process every sequence
	// they contain.
	uint32_t count, pos;
	while (mth_getjob(job, &count, &pos)) {
		const uint32_t end = pos + count;
		for (uint32_t s = pos; s < end; s++) {
			// Decode the sequence in a scratch buffer
			const seq_t *seq = dat->seq[s];
			const uint32_t T = seq->len;
			uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T);
			tag_viterbi(mdl, seq, out, NULL, NULL);
			// Count the positions where the prediction differs
			// from the reference label.
			uint32_t wrong = 0;
			for (uint32_t t = 0; t < T; t++) {
				if (seq->pos[t].lbl != out[t])
					wrong++;
			}
			eval->terr += wrong;
			eval->tcnt += T;
			eval->scnt += 1;
			eval->serr += (wrong != 0);
			free(out);
		}
	}
}
/* uit_setup:
 *   Prepare the model for a training run: clear the stop flag, try to
 *   install the SIGINT handler used for a clean early stop (a warning is
 *   emitted if this fails), record the start time, and when a stopping
 *   window is configured allocate the array holding its values. The window
 *   counters are reset in all cases.
 */
void uit_setup(mdl_t *mdl) {
	uit_stop = false;
	const bool hooked = (signal(SIGINT, uit_signal) != SIG_ERR);
	if (!hooked) {
		warning("failed to set signal handler, no clean early stop");
	}
	gettimeofday(&mdl->timer, NULL);
	if (mdl->opt->stopwin != 0) {
		mdl->werr = wapiti_xmalloc(sizeof(double) * mdl->opt->stopwin);
	}
	mdl->wpos = 0;
	mdl->wcnt = 0;
}
/* pat_exec: * Execute a compiled pattern at position 'at' in the given tokens sequences * in order to produce an observation string. The string is returned as a * newly allocated memory block and the caller is responsible to free it when * not needed anymore. */ char *pat_exec(const pat_t *pat, const tok_t *tok, int at) { static char *bval[] = {"_x-1", "_x-2", "_x-3", "_x-4", "_x-#"}; static char *eval[] = {"_x+1", "_x+2", "_x+3", "_x+4", "_x+#"}; const int T = tok->len; // Prepare the buffer who will hold the result int size = 16, pos = 0; char *buffer = wapiti_xmalloc(sizeof(char) * size); // And loop over the compiled items for (int it = 0; it < pat->nitems; it++) { const pat_item_t *item = &(pat->items[it]); char *value = NULL; int len = 0; // First, if needed, we retrieve the token at the referenced // position in the sequence. We store it in value and let the // command handler do what it need with it. if (item->type != 's') { int pos = item->offset; if (item->absolute) { if (item->offset < 0) pos += T; else pos--; } else { pos += at; } int col = item->column; if (pos < 0) value = bval[min(-pos - 1, 4)]; else if (pos >= T) value = eval[min( pos - T, 4)]; else if (col >= tok->cnts[pos]) fatal("missing tokens, cannot apply pattern"); else value = tok->toks[pos][col]; } // Next, we handle the command, 's' and 'x' are very simple but // 't' and 'm' require us to call the regexp matcher. if (item->type == 's') { value = item->value; len = strlen(value); } else if (item->type == 'x') { len = strlen(value); } else if (item->type == 't') { if (rex_match(item->value, value, &len) == -1) value = "false"; else value = "true"; len = strlen(value); } else if (item->type == 'm') { int pos = rex_match(item->value, value, &len); if (pos == -1) len = 0; value += pos; } // And we add it to the buffer, growing it if needed. If the // user requested it, we also remove caps from the string. 
if (pos + len >= size - 1) { while (pos + len >= size - 1) size = size * 1.4; buffer = wapiti_xrealloc(buffer, sizeof(char) * size); } memcpy(buffer + pos, value, len); if (item->caps) for (int i = pos; i < pos + len; i++) buffer[i] = tolower(buffer[i]); pos += len; } // Adjust the result and return it. buffer[pos++] = '\0'; buffer = wapiti_xrealloc(buffer, sizeof(char) * pos); return buffer; }
/* pat_comp: * Compile the pattern to a form more suitable to easily apply it on tokens * list during data reading. The given pattern string is interned in the * compiled pattern and will be freed with it, so you don't have to take care * of it and must not modify it after the compilation. */ pat_t *pat_comp(char *p) { pat_t *pat = NULL; // Allocate memory for the compiled pattern, the allocation is based // on an over-estimation of the number of required item. As compiled // pattern take a neglectible amount of memory, this waste is not // important. int mitems = 0; for (int pos = 0; p[pos] != '\0'; pos++) if (p[pos] == '%') mitems++; mitems = mitems * 2 + 1; pat = wapiti_xmalloc(sizeof(pat_t) + sizeof(pat->items[0]) * mitems); pat->src = p; // Next, we go through the pattern compiling the items as they are // found. Commands are parsed and put in a corresponding item, and // segment of char not in a command are put in a 's' item. int nitems = 0; int ntoks = 0; int pos = 0; while (p[pos] != '\0') { pat_item_t *item = &(pat->items[nitems++]); item->value = NULL; if (p[pos] == '%') { // This is a command, so first parse its type and check // its a valid one. Next prepare the item. const char type = tolower(p[pos + 1]); if (type != 'x' && type != 't' && type != 'm') fatal("unknown command type: '%c'", type); item->type = type; item->caps = (p[pos + 1] != type); pos += 2; // Next we parse the offset and column and store them in // the item. const char *at = p + pos; int off, col, nch; item->absolute = false; if (sscanf(at, "[@%d,%d%n", &off, &col, &nch) == 2) item->absolute = true; else if (sscanf(at, "[%d,%d%n", &off, &col, &nch) != 2) fatal("invalid pattern: %s", p); if (col < 0) fatal("invalid column number: %d", col); item->offset = off; item->column = col; ntoks = max(ntoks, col); pos += nch; // And parse the end of the argument list, for 'x' there // is nothing to read but for 't' and 'm' we have to get // read the regexp. 
if (type == 't' || type == 'm') { if (p[pos] != ',' && p[pos + 1] != '"') fatal("missing arg in pattern: %s", p); const int start = (pos += 2); while (p[pos] != '\0') { if (p[pos] == '"') break; if (p[pos] == '\\' && p[pos+1] != '\0') pos++; pos++; } if (p[pos] != '"') fatal("unended argument: %s", p); const int len = pos - start; item->value = wapiti_xmalloc(sizeof(char) * (len + 1)); memcpy(item->value, p + start, len); item->value[len] = '\0'; pos++; } // Just check the end of the arg list and loop. if (p[pos] != ']') fatal("missing end of pattern: %s", p); pos++; } else { // No command here, so build an 's' item with the chars // until end of pattern or next command and put it in // the list. const int start = pos; while (p[pos] != '\0' && p[pos] != '%') pos++; const int len = pos - start; item->type = 's'; item->caps = false; item->value = wapiti_xmalloc(sizeof(char) * (len + 1)); memcpy(item->value, p + start, len); item->value[len] = '\0'; } } pat->ntoks = ntoks; pat->nitems = nitems; return pat; }
/* tag_label:
 *   Label a data file using the current model. Sequences are read one by one
 *   from 'fin', decoded (N-best if requested) and written to 'fout' as a copy
 *   of the input lines with the predicted label appended, or the label alone
 *   when the label option is set. With the outsc option, sequence and
 *   per-token scores are written too.
 *
 *   If the check option is specified, the input must be labelled: predictions
 *   of the best sequence are compared with the references, error rates are
 *   reported every 1000 sequences, and global error rates as well as
 *   precision, recall and f-measure per label are reported at the end.
 */
void tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
	// Rows of the statistics table, one column per label:
	//   REF  # of reference with this label
	//   TAG  # of token we have tagged with this label
	//   HIT  # of token where both agree
	enum { REF = 0, TAG = 1, HIT = 2 };
	qrk_t *lbls = mdl->reader->lbl;
	const uint32_t Y = mdl->nlbl;
	const uint32_t N = mdl->opt->nbest;
	uint64_t tcnt = 0, terr = 0;
	uint64_t scnt = 0, serr = 0;
	uint64_t stat[3][Y];
	for (int row = 0; row < 3; row++) {
		for (uint32_t y = 0; y < Y; y++)
			stat[row][y] = 0;
	}
	// Process the input one sequence at a time. The raw lines are kept
	// around as they are echoed to the output next to the predictions.
	while (!feof(fin)) {
		raw_t *raw = rdr_readraw(mdl->reader, fin);
		if (raw == NULL)
			break;
		seq_t *seq = rdr_raw2seq(mdl->reader, raw,
			mdl->opt->check | mdl->opt->force);
		const uint32_t T = seq->len;
		// Decoding buffers: labels and per-token scores are stored
		// as T rows of N entries, plus one score per returned path.
		uint32_t *out = wapiti_xmalloc(sizeof(uint32_t) * T * N);
		double   *psc = wapiti_xmalloc(sizeof(double  ) * T * N);
		double   *scs = wapiti_xmalloc(sizeof(double  ) * N);
		if (N != 1)
			tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
		else
			tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
		// Write each of the N labellings as its own block followed
		// by an empty line.
		for (uint32_t n = 0; n < N; n++) {
			if (mdl->opt->outsc)
				fprintf(fout, "# %d %f\n", (int)n, scs[n]);
			for (uint32_t t = 0; t < T; t++) {
				const uint32_t cell = t * N + n;
				if (!mdl->opt->label)
					fprintf(fout, "%s\t", raw->lines[t]);
				const char *lblstr = qrk_id2str(lbls, out[cell]);
				fprintf(fout, "%s", lblstr);
				if (mdl->opt->outsc) {
					fprintf(fout, "\t%s", lblstr);
					fprintf(fout, "/%f", psc[cell]);
				}
				fprintf(fout, "\n");
			}
			fprintf(fout, "\n");
		}
		fflush(fout);
		// When checking, score the best labelling against the
		// references. Positions whose reference label is unknown to
		// the model are left out of the statistics.
		if (mdl->opt->check) {
			bool err = false;
			for (uint32_t t = 0; t < T; t++) {
				const uint32_t hyp = out[t * N];
				if (seq->pos[t].lbl == (uint32_t)-1)
					continue;
				stat[REF][seq->pos[t].lbl]++;
				stat[TAG][hyp]++;
				if (seq->pos[t].lbl == hyp) {
					stat[HIT][hyp]++;
				} else {
					terr++;
					err = true;
				}
			}
			tcnt += T;
			serr += err;
		}
		// Release everything allocated for this sequence
		free(scs);
		free(psc);
		free(out);
		rdr_freeseq(seq);
		rdr_freeraw(raw);
		// Progress report every 1000 sequences, with the running
		// error rates when they are available.
		scnt++;
		if (scnt % 1000 == 0) {
			info("%10"PRIu64" sequences labeled", scnt);
			if (mdl->opt->check) {
				const double te = (double)terr / tcnt * 100.0;
				const double se = (double)serr / scnt * 100.0;
				info("\t%5.2f%%/%5.2f%%", te, se);
			}
			info("\n");
		}
	}
	// Nothing more to do unless reference labels were checked.
	if (!mdl->opt->check)
		return;
	// Global error rates first, then the per label figures.
	const double te = (double)terr / tcnt * 100.0;
	const double se = (double)serr / scnt * 100.0;
	info(" Nb sequences : %"PRIu64"\n", scnt);
	info(" Token error : %5.2f%%\n", te);
	info(" Sequence error: %5.2f%%\n", se);
	info("* Per label statistics\n");
	for (uint32_t y = 0; y < Y; y++) {
		const char *lbl = qrk_id2str(lbls, y);
		const double Rc = (double)stat[HIT][y] / stat[REF][y];
		const double Pr = (double)stat[HIT][y] / stat[TAG][y];
		const double F1 = 2.0 * (Pr * Rc) / (Pr + Rc);
		info(" %-6s", lbl);
		info(" Pr=%.2f", Pr);
		info(" Rc=%.2f", Rc);
		info(" F1=%.2f\n", F1);
	}
}
/* tag_nbviterbi:
 *   This function implement the Viterbi algorithm in order to decode the N-most
 *   probable sequences of labels according to the model. It can be used to
 *   compute only the best one and will return the same sequence than
 *   tag_viterbi but will be slower to do it.
 *
 *   On return out[t][n] holds the label at position t of the n-th best
 *   sequence. If 'sc' is not NULL, sc[n] receives the score of the n-th
 *   sequence, and if 'psc' is not NULL, psc[t][n] receives the transition
 *   score used at position t of that sequence. All working memory is
 *   allocated here and released before returning.
 */
void tag_nbviterbi(mdl_t *mdl, const seq_t *seq, uint32_t N,
                   uint32_t out[][N], double sc[], double psc[][N]) {
	const uint32_t Y = mdl->nlbl;
	const uint32_t T = seq->len;
	double   *vpsi  = xvm_new(T * Y * Y);
	uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y * N);
	double   (*psi) [T][Y    ][Y] = (void *)vpsi;
	uint32_t (*back)[T][Y * N]    = (void *)vback;
	double *cur = wapiti_xmalloc(sizeof(double) * Y * N);
	double *old = wapiti_xmalloc(sizeof(double) * Y * N);
	// Scratch list of the candidate scores for one node. It is allocated
	// once here instead of living on the stack as Y * N may be large.
	double *lst = wapiti_xmalloc(sizeof(double) * Y * N);
	// We first compute the scores for each transitions in the lattice of
	// labels. The returned 'op' tells if scores have to be combined by
	// product (non zero) or by sum (zero).
	int op;
	if (mdl->type == 1)
		op = tag_memmsc(mdl, seq, vpsi);
	else if (mdl->opt->lblpost)
		op = tag_postsc(mdl, seq, (double *)psi);
	else
		op = tag_expsc(mdl, seq, (double *)psi);
	if (mdl->opt->force)
		tag_forced(mdl, seq, vpsi, op);
	// Here also, it's classical but we have to keep the N best paths
	// leading to each nodes of the lattice instead of only the best one.
	// This mean that code is less trivial and the current implementation is
	// not the most efficient way to do this but it works well and is good
	// enough for the moment.
	// We first build the list of all incoming arcs from all paths from all
	// N-best nodes and next select the N-best one. There is a lot of room
	// here for later optimisations if needed.
	//
	// At the first position there is a single path per label, the N - 1
	// other slots are filled with the lowest possible score.
	for (uint32_t y = 0, d = 0; y < Y; y++) {
		cur[d++] = (*psi)[0][0][y];
		for (uint32_t n = 1; n < N; n++)
			cur[d++] = -DBL_MAX;
	}
	for (uint32_t t = 1; t < T; t++) {
		for (uint32_t d = 0; d < Y * N; d++)
			old[d] = cur[d];
		for (uint32_t y = 0; y < Y; y++) {
			// 1st, build the list of all incoming
			for (uint32_t yp = 0, d = 0; yp < Y; yp++) {
				for (uint32_t n = 0; n < N; n++, d++) {
					lst[d] = old[d];
					if (op)
						lst[d] *= (*psi)[t][yp][y];
					else
						lst[d] += (*psi)[t][yp][y];
				}
			}
			// 2nd, init the back with the N first
			uint32_t *bk = &(*back)[t][y * N];
			for (uint32_t n = 0; n < N; n++)
				bk[n] = n;
			// 3rd, search the N highest values
			for (uint32_t i = N; i < N * Y; i++) {
				// Search the smallest current value
				uint32_t idx = 0;
				for (uint32_t n = 1; n < N; n++)
					if (lst[bk[n]] < lst[bk[idx]])
						idx = n;
				// And replace it if needed
				if (lst[i] > lst[bk[idx]])
					bk[idx] = i;
			}
			// 4th, get the new scores
			for (uint32_t n = 0; n < N; n++)
				cur[y * N + n] = lst[bk[n]];
		}
	}
	// Retrieving the best paths is similar to classical Viterbi except that
	// we have to search for the N best ones and there is N time more
	// possibles starts.
	for (uint32_t n = 0; n < N; n++) {
		uint32_t bst = 0;
		for (uint32_t d = 1; d < Y * N; d++)
			if (cur[d] > cur[bst])
				bst = d;
		if (sc != NULL)
			sc[n] = cur[bst];
		// Remove this end point so the next round finds the next one
		cur[bst] = -DBL_MAX;
		for (uint32_t t = T; t > 0; t--) {
			// The back-pointers of the first position are never
			// filled as there is no predecessor there, so they
			// must not be read.
			const uint32_t prv = (t != 1) ? (*back)[t - 1][bst] : 0;
			const uint32_t yp  = prv / N;
			const uint32_t y   = bst / N;
			out[t - 1][n] = y;
			if (psc != NULL)
				psc[t - 1][n] = (*psi)[t - 1][yp][y];
			bst = prv;
		}
	}
	free(lst);
	free(old);
	free(cur);
	free(vback);
	xvm_free(vpsi);
}
/* tag_viterbi:
 *   Decode the single most probable sequence of labels for 'seq' according
 *   to the model using the Viterbi algorithm.
 *
 *   On return out[t] holds the label chosen at position t. If 'sc' is not
 *   NULL it receives the score of the best path, and if 'psc' is not NULL,
 *   psc[t] receives the transition score used at position t. The working
 *   buffers are obtained from xvm_new and wapiti_xmalloc and released before
 *   returning.
 */
void tag_viterbi(mdl_t *mdl, const seq_t *seq,
                 uint32_t out[], double *sc, double psc[]) {
	const uint32_t Y = mdl->nlbl;
	const uint32_t T = seq->len;
	double   *vpsi  = xvm_new(T * Y * Y);
	uint32_t *vback = wapiti_xmalloc(sizeof(uint32_t) * T * Y);
	double   (*psi) [T][Y][Y] = (void *)vpsi;
	uint32_t (*back)[T][Y]    = (void *)vback;
	double *cur = wapiti_xmalloc(sizeof(double) * Y);
	double *old = wapiti_xmalloc(sizeof(double) * Y);
	// Fill the lattice with the transition scores. The scoring function
	// depends on the model type and options; its result tells whether
	// scores combine by product (non zero) or by sum (zero).
	int op;
	if (mdl->type == 1) {
		op = tag_memmsc(mdl, seq, vpsi);
	} else if (mdl->opt->lblpost) {
		op = tag_postsc(mdl, seq, vpsi);
	} else {
		op = tag_expsc(mdl, seq, vpsi);
	}
	if (mdl->opt->force) {
		tag_forced(mdl, seq, vpsi, op);
	}
	// Forward pass with the sum replaced by a max:
	//   | α_1(y) = Ψ_1(y,x_1)
	//   | α_t(y) = max_{y'} α_{t-1}(y') (+ or *) Ψ_t(y',y,x_t)
	// Only the α vectors of the current and previous positions are kept.
	// The label y' selected by each max is recorded in 'back' so the best
	// path can be walked backward afterwards.
	for (uint32_t y = 0; y < Y; y++) {
		cur[y] = (*psi)[0][0][y];
	}
	for (uint32_t t = 1; t < T; t++) {
		// The current vector becomes the previous one. The other
		// buffer is fully rewritten below so a swap is enough.
		double *tmp = old;
		old = cur;
		cur = tmp;
		for (uint32_t y = 0; y < Y; y++) {
			double   best = -HUGE_VAL;
			uint32_t from = 0;
			for (uint32_t yp = 0; yp < Y; yp++) {
				double val = old[yp];
				if (op)
					val *= (*psi)[t][yp][y];
				else
					val += (*psi)[t][yp][y];
				if (val > best) {
					best = val;
					from = yp;
				}
			}
			(*back)[t][y] = from;
			cur[y] = best;
		}
	}
	// The best path ends on the label with the highest final score. From
	// there the back-pointers are followed down to the first position.
	uint32_t bst = 0;
	for (uint32_t y = 1; y < Y; y++) {
		if (cur[y] > cur[bst])
			bst = y;
	}
	if (sc != NULL) {
		*sc = cur[bst];
	}
	for (uint32_t t = T; t > 0; t--) {
		const uint32_t y = bst;
		// The first position has no predecessor, label 0 is used to
		// index its score.
		uint32_t yp = 0;
		if (t != 1)
			yp = (*back)[t - 1][bst];
		out[t - 1] = y;
		if (psc != NULL)
			psc[t - 1] = (*psi)[t - 1][yp][y];
		bst = yp;
	}
	free(old);
	free(cur);
	free(vback);
	xvm_free(vpsi);
}
/* qrk_str2id:
 *   Map a key to a uniq identifier. If the key is already present in the
 *   map its identifier is returned. Else, if the map is locked, 'none' is
 *   returned and the map is left untouched. Otherwise the next free
 *   identifier is allocated and the new (key,id) pair is inserted in the
 *   quark. This function is not thread safe and should not be called on the
 *   same map from different thread without locking.
 */
size_t qrk_str2id(qrk_t *qrk, const char *key) {
	const uint8_t *raw = (void *)key;
	const size_t   len = strlen(key);
	// The empty trie is handled apart: the first key simply become the
	// root. This allow the code below to assume a well formed trie without
	// NULL pointers in it.
	if (qrk->count == 0) {
		if (qrk->lock == true)
			return none;
		const size_t ksize = sizeof(char) * (len + 1);
		leaf_t *first = wapiti_xmalloc(sizeof(leaf_t) + ksize);
		memcpy(first->key, key, ksize);
		first->id = 0;
		qrk->root = qrk_lf2nd(first);
		qrk->leafs[0] = first;
		qrk->count = 1;
		return 0;
	}
	// Go down the trie like for a lookup, until a leaf is reached. The
	// key stored there is either our key, or one which agree with our key
	// on all the bits tested on the way.
	const node_t *nd = qrk->root;
	while (!qrk_isleaf(nd)) {
		const uint8_t chr = nd->pos < len ? raw[nd->pos] : 0;
		const int side = ((chr | nd->byte) + 1) >> 8;
		nd = nd->child[side];
	}
	const char *bst = qrk_nd2lf(nd)->key;
	// Find the first byte where the two keys differ
	size_t pos = 0;
	while (pos < len && key[pos] == bst[pos])
		pos++;
	// No difference and the stored key also ends here: the key is known.
	if (pos == len && bst[pos] == '\0')
		return qrk_nd2lf(nd)->id;
	if (qrk->lock == true)
		return none;
	// The keys differ in byte 'pos'. Build the mask of the new critical
	// bit: keep only the highest differing bit and invert the result.
	uint8_t byte = (pos != len) ? key[pos] ^ bst[pos] : bst[pos];
	while (byte & (byte - 1))
		byte &= byte - 1;
	byte ^= 255;
	// Build the new internal node and the new leaf. The side taken by the
	// existing key tells which child will receive the existing subtree,
	// the new leaf goes on the other one.
	const uint8_t chr = bst[pos];
	const int side = ((chr | byte) + 1) >> 8;
	const size_t ksize = sizeof(char) * (len + 1);
	node_t *nx = wapiti_xmalloc(sizeof(node_t));
	leaf_t *lf = wapiti_xmalloc(sizeof(leaf_t) + ksize);
	memcpy(lf->key, key, ksize);
	lf->id   = qrk->count++;
	nx->pos  = pos;
	nx->byte = byte;
	nx->child[1 - side] = qrk_lf2nd(lf);
	// Register the leaf in the id indexed array, growing it when full
	if (lf->id == qrk->size) {
		qrk->size *= 1.4;
		const size_t bytes = sizeof(leaf_t *) * qrk->size;
		qrk->leafs = wapiti_xrealloc(qrk->leafs, bytes);
	}
	qrk->leafs[lf->id] = lf;
	// Last step, link the new node in the trie. Nodes have to stay
	// ordered, so we walk down again until a leaf or a node testing a
	// later position (or the same position with a greater mask) is found.
	node_t **slot = &qrk->root;
	for (;;) {
		node_t *here = *slot;
		if (qrk_isleaf(here))
			break;
		if (here->pos > pos)
			break;
		if (here->pos == pos && here->byte > byte)
			break;
		const uint8_t c = here->pos < len ? raw[here->pos] : 0;
		const int dir = ((c | here->byte) + 1) >> 8;
		slot = &here->child[dir];
	}
	nx->child[side] = *slot;
	*slot = nx;
	return lf->id;
}