/* tag_ratio:
 *   Return num / den as a double, or 0.0 when den is zero. Used by tag_label
 *   so that rates computed from empty counters are reported as 0 instead of
 *   NaN.
 */
static double tag_ratio(uint64_t num, uint64_t den) {
	return den == 0 ? 0.0 : (double)num / (double)den;
}

/* tag_label:
 *   Label a data file using the current model. This outputs an almost exact
 *   copy of the input file with an additional column holding the predicted
 *   label. If the check option is set, the reference labels found in the
 *   input are compared with the predicted ones: error rates are reported
 *   during the labelling and detailed per-label statistics at the end.
 *
 *   Sequences are read from <fin> and the labelled output is written to
 *   <fout>. Returns 1 when the whole input was processed and 0 as soon as
 *   rdr_raw2seq fails to build a sequence from the raw input.
 */
int tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
	qrk_t *lbls = mdl->reader->lbl;
	const uint32_t Y = mdl->nlbl;
	const uint32_t N = mdl->opt->nbest;
	// We start by preparing the statistic collection to be ready if the
	// check option is used. The stat array holds the following for each
	// label:
	//   [0] # of references with this label
	//   [1] # of tokens we have tagged with this label
	//   [2] # of matches between the two preceding
	uint64_t tcnt = 0, terr = 0;
	uint64_t scnt = 0, serr = 0;
	uint64_t stat[3][Y];
	for (uint32_t y = 0; y < Y; y++)
		stat[0][y] = stat[1][y] = stat[2][y] = 0;
	// Next read the input file sequence by sequence and label them. We
	// have to take care of not discarding the raw input as we want to send
	// it back to the output with the additional predicted labels.
	while (!feof(fin)) {
		// So, first read an input sequence keeping the raw_t object
		// available, and label it with Viterbi.
		raw_t *raw = rdr_readraw(mdl->reader, fin);
		if (raw == NULL)
			break;
		seq_t *seq = rdr_raw2seq(mdl->reader, raw,
			mdl->opt->check | mdl->opt->force);
		if (seq == NULL) {
			rdr_freeraw(raw);
			return 0;
		}
		const uint32_t T = seq->len;
		// out and psc are laid out as T rows of N entries: the n-th
		// best label of token t is stored at index t * N + n.
		uint32_t *out = xmalloc(sizeof(uint32_t) * T * N);
		double   *psc = xmalloc(sizeof(double  ) * T * N);
		double   *scs = xmalloc(sizeof(double  ) * N);
		if (N == 1)
			tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
		else
			tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
		// Next we output the raw sequence with an additional column for
		// the predicted labels, once per requested labelling.
		for (uint32_t n = 0; n < N; n++) {
			if (mdl->opt->outsc)
				fprintf(fout, "# %d %f\n", (int)n, scs[n]);
			for (uint32_t t = 0; t < T; t++) {
				if (!mdl->opt->label)
					fprintf(fout, "%s\t", raw->lines[t]);
				uint32_t lbl = out[t * N + n];
				const char *lblstr = qrk_id2str(lbls, lbl);
				fprintf(fout, "%s", lblstr);
				if (mdl->opt->outsc) {
					fprintf(fout, "\t%s", lblstr);
					fprintf(fout, "/%f", psc[t * N + n]);
				}
				fprintf(fout, "\n");
			}
			fprintf(fout, "\n");
		}
		fflush(fout);
		// If the user provided reference labels, use them to collect
		// statistics about how well we have performed here. Tokens
		// whose reference label is (uint32_t)-1 are skipped; only the
		// first labelling (n = 0) is scored. Note that tcnt still
		// counts every token of the sequence, skipped ones included.
		if (mdl->opt->check) {
			bool err = false;
			for (uint32_t t = 0; t < T; t++) {
				if (seq->pos[t].lbl == (uint32_t)-1)
					continue;
				stat[0][seq->pos[t].lbl]++;
				stat[1][out[t * N]]++;
				if (seq->pos[t].lbl != out[t * N])
					terr++, err = true;
				else
					stat[2][out[t * N]]++;
			}
			tcnt += T;
			serr += err;
		}
		// Cleanup memory used for this sequence
		free(scs);
		free(psc);
		free(out);
		rdr_freeseq(seq);
		rdr_freeraw(raw);
		// And report our progress: at regular intervals we display how
		// many sequences are labelled and, if possible, the current
		// token and sequence error rates.
		if (++scnt % 1000 == 0) {
			info("%10"PRIu64" sequences labeled", scnt);
			if (mdl->opt->check) {
				const double te = tag_ratio(terr, tcnt) * 100.0;
				const double se = tag_ratio(serr, scnt) * 100.0;
				info("\t%5.2f%%/%5.2f%%", te, se);
			}
			info("\n");
		}
	}
	// If the user has provided reference labels, we have collected a lot
	// of statistics and we can report the global token and sequence error
	// rates as well as precision, recall and f-measure for each label.
	if (mdl->opt->check) {
		// tag_ratio yields 0 for empty counters, so an empty input or a
		// label never seen or never predicted does not print NaN.
		const double te = tag_ratio(terr, tcnt) * 100.0;
		const double se = tag_ratio(serr, scnt) * 100.0;
		info("    Nb sequences  : %"PRIu64"\n", scnt);
		info("    Token error   : %5.2f%%\n", te);
		info("    Sequence error: %5.2f%%\n", se);
		info("* Per label statistics\n");
		for (uint32_t y = 0; y < Y; y++) {
			const char   *lbl = qrk_id2str(lbls, y);
			const double  Rc  = tag_ratio(stat[2][y], stat[0][y]);
			const double  Pr  = tag_ratio(stat[2][y], stat[1][y]);
			// Pr + Rc is zero when the label has no match at all;
			// report an f-measure of 0 in that case.
			const double  F1  = (Pr + Rc) > 0.0
				? 2.0 * (Pr * Rc) / (Pr + Rc)
				: 0.0;
			info("    %-6s", lbl);
			info("  Pr=%.2f", Pr);
			info("  Rc=%.2f", Rc);
			info("  F1=%.2f\n", F1);
		}
	}
	return 1;
}
/* rdr_freedat:
 *   Release a dat_t object: each of its nseq sequences is handed to
 *   rdr_freeseq, then the sequence array and the object itself are freed.
 */
void rdr_freedat(dat_t *dat) {
	uint32_t idx = 0;
	while (idx < dat->nseq) {
		rdr_freeseq(dat->seq[idx]);
		idx++;
	}
	free(dat->seq);
	free(dat);
}