/* Thin wrapper: invokes read_parse() on `ibuffer` and the address of `scan`
 * (both declared outside this function) and hands the result back unchanged.
 * NOTE(review): going by the name, the result is presumably one parsed
 * S-expression -- confirm against this read_parse() variant. */
static cellpoint read_sexp(void)
{
  cellpoint parsed = read_parse(ibuffer, &scan);

  return parsed;
}
int read_sentence(corpusflags_type *flags, FILE *in, sentence_type *s, feature_type *fmax, int *maxnparses) { int i, nread, best_fscore_index = -1, nwinners = 0; DataFloat fscore, best_logprob = -DATAFLOAT_MAX, best_fscore = -1; nread = fscanf(in, " G = " DATAFLOAT_FORMAT " ", &s->g); if (nread == EOF) return EOF; assert(nread == 0 || nread == 1); if (nread == 0) s->g = 1; nread = fscanf(in, " N = %d", &s->nparses); if (nread == EOF) return EOF; assert(nread == 1); assert(s->nparses >= 0); if (s->nparses > *maxnparses) *maxnparses = s->nparses; if (s->nparses > 0) { s->parse = SMALLOC(s->nparses*sizeof(parse_type)); assert(s->parse != NULL); } else s->parse = NULL; for (i = 0; i < s->nparses; ++i) { read_parse(in, &s->parse[i], fmax); // handle the admittedly strange case where a parse has no brackets if (s->parse[i].w == 0.0 || s->parse[i].p == 0.0) { fscore = 0.0; } else { fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g); } if (fscore+DATAFLOAT_EPS >= best_fscore) { Float logprob = feature_value(&s->parse[i], 0); // logprob is feature 0 if (fabs(fscore-best_fscore) < 2*DATAFLOAT_EPS) { ++nwinners; // tied best f-scores if (logprob > best_logprob) { // pick candidate with best logprob best_fscore = fscore; best_logprob = logprob; best_fscore_index = i; } } else { best_fscore = fscore; best_logprob = logprob; best_fscore_index = i; nwinners = 1; } } } // skip sentences where the best f-score was 0 since there's nothing to learn from them if (best_fscore == 0.0) { best_fscore = -1; best_logprob = -DATAFLOAT_MAX; best_fscore_index = -1; nwinners = 0; } if (best_fscore_index >= 0) { /* is there a winner? 
*/ Float sum_Pyx = 0; assert(nwinners > 0); assert(best_fscore_index < s->nparses); s->Px = 1.0; /* indicate that there is a winner */ s->correct_index = best_fscore_index; if (flags && flags->Pyx_factor > 1) { Float Z = 0; for (i = 0; i < s->nparses; ++i) { fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g); Z += pow(flags->Pyx_factor, fscore - best_fscore); } for (i = 0; i < s->nparses; ++i) { fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g); sum_Pyx += s->parse[i].Pyx = pow(flags->Pyx_factor, fscore - best_fscore) / Z; } } else if (flags && flags->Pyx_factor > 0) { /* Pyx_factor == 1; all winners get equal Pyx */ for (i = 0; i < s->nparses; ++i) { Float fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g); if (fabs(best_fscore-fscore) < 2*DATAFLOAT_EPS) sum_Pyx += s->parse[i].Pyx = 1.0/nwinners; else sum_Pyx += s->parse[i].Pyx = 0; } } else { /* Pyx_factor == 0; f-score winner gets Pyx = 1, all others 0 */ for (i = 0; i < s->nparses; ++i) sum_Pyx += s->parse[i].Pyx = (i == best_fscore_index); } assert(fabs(sum_Pyx - 1) <= DATAFLOAT_EPS); } else { /* no winner, set Px to 0.0 to indicate this */ s->Px = 0.0; s->correct_index = s->nparses; for (i = 0; i < s->nparses; ++i) s->parse[i].Pyx = 0.0; /* We used to ensure that s->nparses == 0 here, but instead do nothing since read_corpus now knows to skip these sentences. These sentences show up when we have parse failures and the parser returns a flat bracketing. */ } return s->nparses; } /* read_sentence() */
/* read_sentence() reads one sentence record from `in` into *s.
 *
 * Input layout consumed here: an optional "G = <value>" header (s->g
 * defaults to 1 when it is absent), a mandatory "N = <count>" header, and
 * then <count> parses, each read by read_parse().
 *
 * flags       may be NULL; a NULL flags behaves like Pyx_factor == 0
 *             (the f-score winner gets Pyx = 1, every other parse 0).
 * in          stream to read from.
 * s           filled in: g, nparses, parse (SMALLOC'd array, or NULL when
 *             nparses == 0), Px and every parse[i].Pyx.
 * fmax        passed through to read_parse().
 * maxnparses  raised to s->nparses if that is larger.
 *
 * Each parse's f-score is 2*w/(p+g).  The "winner" is the parse with the
 * highest f-score; ties (within 2*DATAFLOAT_EPS) are broken by the larger
 * value of feature 0.
 *
 * With a winner: s->Px = 1 and the Pyx values sum to 1, distributed
 * according to flags->Pyx_factor (see below).  If no parse was selected:
 * s->Px = 0 and all Pyx = 0.
 *
 * Returns EOF if end of input is hit while reading either header, otherwise
 * s->nparses.  Malformed headers are caught only by assert().
 */
int read_sentence(corpusflags_type *flags, FILE *in, sentence_type *s,
                  feature_type *fmax, int *maxnparses)
{
  int i, nread, best_fscore_index = -1, nwinners = 0;
  DataFloat fscore, best_logprob = -DATAFLOAT_MAX, best_fscore = -1;

  /* optional "G = ..." header */
  nread = fscanf(in, " G = " DATAFLOAT_FORMAT " ", &s->g);
  if (nread == EOF)
    return EOF;
  assert(nread == 0 || nread == 1);
  if (nread == 0)
    s->g = 1;

  /* mandatory "N = ..." header */
  nread = fscanf(in, " N = %u", &s->nparses);
  if (nread == EOF)
    return EOF;
  assert(nread == 1);
  assert(s->nparses >= 0);
  if (s->nparses > *maxnparses)
    *maxnparses = s->nparses;

  if (s->nparses > 0) {
    s->parse = SMALLOC(s->nparses*sizeof(parse_type));
    assert(s->parse != NULL);
  }
  else
    s->parse = NULL;

  /* Read every parse, tracking the best f-score seen so far. */
  for (i = 0; i < s->nparses; ++i) {
    read_parse(in, &s->parse[i], fmax);
    fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g);

    if (fscore+DATAFLOAT_EPS >= best_fscore) {
      Float logprob = feature_value(&s->parse[i], 0);  /* logprob is feature 0 */
      if (fabs(fscore-best_fscore) < 2*DATAFLOAT_EPS) {
        ++nwinners;                     /* tied best f-scores */
        if (logprob > best_logprob) {   /* pick candidate with best logprob */
          best_fscore = fscore;
          best_logprob = logprob;
          best_fscore_index = i;
        }
      }
      else {
        best_fscore = fscore;
        best_logprob = logprob;
        best_fscore_index = i;
        nwinners = 1;
      }
    }
  }

  if (best_fscore_index >= 0) {   /* is there a winner? */
    Float sum_Pyx = 0;
    assert(nwinners > 0);
    s->Px = 1.0;                  /* indicate that there is a winner */

    /* flags is tested before every dereference so that a NULL flags falls
       through to the final "winner takes all" branch instead of crashing. */
    if (flags && flags->Pyx_factor > 1) {
      /* Pyx proportional to Pyx_factor^(fscore - best_fscore) */
      Float Z = 0;
      for (i = 0; i < s->nparses; ++i) {
        fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g);
        Z += pow(flags->Pyx_factor, fscore - best_fscore);
      }
      for (i = 0; i < s->nparses; ++i) {
        fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g);
        sum_Pyx += s->parse[i].Pyx
          = pow(flags->Pyx_factor, fscore - best_fscore) / Z;
      }
    }
    else if (flags && flags->Pyx_factor > 0) {
      /* Pyx_factor == 1; all winners get equal Pyx */
      for (i = 0; i < s->nparses; ++i) {
        Float parse_fscore = 2 * s->parse[i].w/(s->parse[i].p+s->g);
        if (fabs(best_fscore-parse_fscore) < 2*DATAFLOAT_EPS)
          sum_Pyx += s->parse[i].Pyx = 1.0/nwinners;
        else
          sum_Pyx += s->parse[i].Pyx = 0;
      }
    }
    else {
      /* Pyx_factor == 0; f-score winner gets Pyx = 1, all others 0 */
      for (i = 0; i < s->nparses; ++i)
        sum_Pyx += s->parse[i].Pyx = (i == best_fscore_index);
    }

    assert(fabs(sum_Pyx - 1) <= DATAFLOAT_EPS);
  }
  else {
    /* no winner, set Px to 0.0 to indicate this */
    s->Px = 0.0;
    for (i = 0; i < s->nparses; ++i)
      s->parse[i].Pyx = 0.0;
  }

  return s->nparses;
}  /* read_sentence() */