Exemple #1
0
/* tag_label:
 *   Label a data file using the current model. This output an almost exact copy
 *   of the input file with an additional column with the predicted label. If
 *   the check option is specified, the input file must be labelled and the
 *   predicted labels will be checked against the provided ones. This will
 *   output error rates during the labelling and detailed statistics per label
 *   at the end.
 */
int tag_label(mdl_t *mdl, FILE *fin, FILE *fout) {
	qrk_t *lbls = mdl->reader->lbl;
	const uint32_t Y = mdl->nlbl;
	const uint32_t N = mdl->opt->nbest;
	// We start by preparing the statistic collection to be ready if check
	// option is used. The stat array hold the following for each label
	//   [0] # of reference with this label
	//   [1] # of token we have taged with this label
	//   [2] # of match of the two preceding
	uint64_t tcnt = 0, terr = 0;
	uint64_t scnt = 0, serr = 0;
	uint64_t stat[3][Y];
	for (uint32_t y = 0; y < Y; y++)
		stat[0][y] = stat[1][y] = stat[2][y] = 0;
	// Next read the input file sequence by sequence and label them, we have
	// to take care of not discarding the raw input as we want to send it
	// back to the output with the additional predicted labels.
	while (!feof(fin)) {
		// So, first read an input sequence keeping the raw_t object
		// available, and label it with Viterbi.
		raw_t *raw = rdr_readraw(mdl->reader, fin);
		if (raw == NULL)
			break;
		seq_t *seq = rdr_raw2seq(mdl->reader, raw,
			mdl->opt->check | mdl->opt->force);
        if (seq == NULL) {
            rdr_freeraw(raw);
            return 0;
        }
		const uint32_t T = seq->len;
		uint32_t *out = xmalloc(sizeof(uint32_t) * T * N);
		double   *psc = xmalloc(sizeof(double  ) * T * N);
		double   *scs = xmalloc(sizeof(double  ) * N);
		if (N == 1)
			tag_viterbi(mdl, seq, (uint32_t*)out, scs, (double*)psc);
		else
			tag_nbviterbi(mdl, seq, N, (void*)out, scs, (void*)psc);
		// Next we output the raw sequence with an aditional column for
		// the predicted labels
		for (uint32_t n = 0; n < N; n++) {
			if (mdl->opt->outsc)
				fprintf(fout, "# %d %f\n", (int)n, scs[n]);
			for (uint32_t t = 0; t < T; t++) {
				if (!mdl->opt->label)
					fprintf(fout, "%s\t", raw->lines[t]);
				uint32_t lbl = out[t * N + n];
				const char *lblstr = qrk_id2str(lbls, lbl);
				fprintf(fout, "%s", lblstr);
				if (mdl->opt->outsc) {
					fprintf(fout, "\t%s", lblstr);
					fprintf(fout, "/%f", psc[t * N + n]);
				}
				fprintf(fout, "\n");
			}
			fprintf(fout, "\n");
		}
		fflush(fout);
		// If user provided reference labels, use them to collect
		// statistics about how well we have performed here. Labels
		// unseen at training time are discarded.
		if (mdl->opt->check) {
			bool err = false;
			for (uint32_t t = 0; t < T; t++) {
				if (seq->pos[t].lbl == (uint32_t)-1)
					continue;
				stat[0][seq->pos[t].lbl]++;
				stat[1][out[t * N]]++;
				if (seq->pos[t].lbl != out[t * N])
					terr++, err = true;
				else
					stat[2][out[t * N]]++;
			}
			tcnt += T;
			serr += err;
		}
		// Cleanup memory used for this sequence
		free(scs);
		free(psc);
		free(out);
		rdr_freeseq(seq);
		rdr_freeraw(raw);
		// And report our progress, at regular interval we display how
		// much sequence are labelled and if possible the current tokens
		// and sequence error rates.
		if (++scnt % 1000 == 0) {
			info("%10"PRIu64" sequences labeled", scnt);
			if (mdl->opt->check) {
				const double te = (double)terr  / tcnt * 100.0;
				const double se = (double)serr  / scnt * 100.0;
				info("\t%5.2f%%/%5.2f%%", te, se);
			}
			info("\n");
		}
	}
	// If user have provided reference labels, we have collected a lot of
	// statistics and we can repport global token and sequence error rate as
	// well as precision recall and f-measure for each labels.
	if (mdl->opt->check) {
		const double te = (double)terr  / tcnt * 100.0;
		const double se = (double)serr  / scnt * 100.0;
		info("    Nb sequences  : %"PRIu64"\n", scnt);
		info("    Token error   : %5.2f%%\n", te);
		info("    Sequence error: %5.2f%%\n", se);
		info("* Per label statistics\n");
		for (uint32_t y = 0; y < Y; y++) {
			const char   *lbl = qrk_id2str(lbls, y);
			const double  Rc  = (double)stat[2][y] / stat[0][y];
			const double  Pr  = (double)stat[2][y] / stat[1][y];
			const double  F1  = 2.0 * (Pr * Rc) / (Pr + Rc);
			info("    %-6s", lbl);
			info("  Pr=%.2f", Pr);
			info("  Rc=%.2f", Rc);
			info("  F1=%.2f\n", F1);
		}
	}
    
    return 1;
}
Exemple #2
0
/* rdr_freedat:
 *   Free all memory used by a dat_t object.
 */
void rdr_freedat(dat_t *dat) {
	for (uint32_t i = 0; i < dat->nseq; i++)
		rdr_freeseq(dat->seq[i]);
	free(dat->seq);
	free(dat);
}