int main(int argc, char **argv) { si_t si = make_si(1024); FILE *grammarfp = stdin, *yieldfp; FILE *tracefp = NULL; /* trace output */ FILE *summaryfp = stderr; /* end of parse stats output */ FILE *parsefp = stdout; /* parse trees */ FILE *probfp = NULL; /* max_neglog_prob */ chart_cell root_cell; grammar g; chart c; vindex terms; int maxsentlen = 0; int sentenceno = 0, parsed_sentences = 0, failed_sentences = 0; double sum_neglog_prob = 0; int sentfrom = 0; int sentto = 0; srand(RAND_SEED); /* seed random number generator */ if (argc<2 || argc>6) { fprintf(stderr, "%s yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]\n", argv[0]); exit(EXIT_FAILURE); } if ((yieldfp = fopen(argv[1], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open yieldfile %s\n", argv[0], argv[1]); exit(EXIT_FAILURE); } if (argc >= 3) if (!sscanf(argv[2], "%d", &maxsentlen)) { fprintf(stderr, "%s: Couldn't parse maxsentlen %s\n", argv[0], argv[2]); exit(EXIT_FAILURE); } if (argc >= 4) if ((grammarfp = fopen(argv[3], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open grammarfile %s\n", argv[0], argv[3]); exit(EXIT_FAILURE); } if (argc >= 6) { if (!sscanf(argv[4], "%d", &sentfrom)) { fprintf(stderr, "%s: Couldn't parse sentfrom %s\n", argv[0], argv[4]); exit(EXIT_FAILURE); } if (!sscanf(argv[5], "%d", &sentto)) { fprintf(stderr, "%s: Couldn't parse sentto %s\n", argv[0], argv[5]); exit(EXIT_FAILURE); } } g = read_grammar(grammarfp, si); /* write_grammar(tracefp, g, si); */ while ((terms = read_terms(yieldfp, si))) { sentenceno++; if (sentfrom && sentenceno < sentfrom) { vindex_free(terms); continue; } if (sentto && sentenceno > sentto) { vindex_free(terms); break; } /* skip if sentence is too long */ if (!maxsentlen || (int) terms->n <= maxsentlen) { size_t i; if (tracefp) { fprintf(tracefp, "\nSentence %d:\n", sentenceno); for (i=0; i<terms->n; i++) fprintf(tracefp, " %s", si_index_string(si, terms->e[i])); fprintf(tracefp, "\n"); } c = cky(*terms, g, si); /* fetch best root node */ 
root_cell = sihashcc_ref(CHART_ENTRY(c, 0, terms->n), g.root_label); if (root_cell) { tree parse_tree = bintree_tree(&root_cell->tree, si); double prob = (double) root_cell->prob; parsed_sentences++; assert(prob > 0.0); sum_neglog_prob -= log(prob); if (probfp) fprintf(probfp, "max_neglog_prob(%d, %g).\n", sentenceno, -log(prob)); if (tracefp) fprintf(tracefp, " Prob = %g\n", prob); if (parsefp) { write_tree(parsefp, parse_tree, si); fprintf(parsefp, "\n"); /* write_prolog_tree(parsefp, parse_tree, si); */ } free_tree(parse_tree); } else { failed_sentences++; if (tracefp) fprintf(tracefp, "Failed to parse\n"); if (parsefp) fprintf(parsefp, "parse_failure.\n"); } chart_free(c, terms->n); /* free the chart */ } else { /* sentence too long */ if (parsefp) fprintf(parsefp, "too_long.\n"); } vindex_free(terms); /* free the terms */ assert(trees_allocated == 0); assert(bintrees_allocated == 0); } free_grammar(g); si_free(si); if (summaryfp) { fprintf(summaryfp, "\n%d/%d = %g%% test sentences met the length criteron," " of which %d/%d = %g%% were parsed\n", parsed_sentences+failed_sentences, sentenceno, (double) (100.0 * (parsed_sentences+failed_sentences)) / sentenceno, parsed_sentences, parsed_sentences+failed_sentences, (double) (100.0 * parsed_sentences) / (parsed_sentences + failed_sentences)); fprintf(summaryfp, "Sum(-log prob) = %g\n", sum_neglog_prob); } /* check that everything has been deallocated */ /* printf("mmm_blocks_allocated = %ld\n", (long) mmm_blocks_allocated); */ assert(mmm_blocks_allocated == 0); exit(EXIT_SUCCESS); }
/*
 * expected_rule_counts() makes one pass over the yield file, building an
 * inside and an outside chart for every sentence that is short enough and
 * has a non-zero root weight.
 *
 *   g, si                 grammar and its string index
 *   yieldfp               yield file; rewound before reading
 *   tracefp               per-sentence trace output, may be NULL; only
 *                         written when debuglevel >= 10000
 *   summaryfp             summary output, may be NULL; only written when
 *                         debuglevel >= 1000
 *   maxsentlen            sentences with more terminals are skipped;
 *                         0 means no limit
 *   minruleprob           passed to compute_unary_closure()
 *   wordscale             passed to inside_chart(); its per-word
 *                         contribution terms->n * log(wordscale) is
 *                         subtracted from log(root_prob) in the returned sum
 *   rule_counts           array of g->nrules entries; zeroed here and then
 *                         handed to outside_chart() for each parsed sentence
 *                         (presumably where the expected counts are
 *                         accumulated -- confirm against outside_chart())
 *   sum_yieldweights      out: sum of yieldweight * sentence length over the
 *                         parsed sentences
 *   weighted_yields_flag  passed through to read_terms()
 *
 * Returns the sum over parsed sentences of
 *   -yieldweight * (log(root_prob) - terms->n * log(wordscale)).
 */
FLOAT expected_rule_counts(const grammar g, const si_t si, FILE *yieldfp,
                           FILE *tracefp, FILE *summaryfp, int debuglevel,
                           int maxsentlen, FLOAT minruleprob, FLOAT wordscale,
                           FLOAT *rule_counts, FLOAT *sum_yieldweights,
                           int weighted_yields_flag)
{
  vindex terms;
  FLOAT  root_prob;
  chart  inside, outside;
  long   sentenceno = 0, failed_sentences = 0;
  double sum_neglog_prob = 0.0;
  float  yieldweight;

  *sum_yieldweights = 0;

  /* FLOAT *rule_counts = CALLOC(g->nrules, sizeof(FLOAT)); */

  {
    size_t i;                   /* zero rule counts */
    for (i = 0; i < g->nrules; i++)
      rule_counts[i] = 0.0;
  }

  compute_unary_closure(g, minruleprob);  /* compute unary_close */

  rewind(yieldfp);              /* rewind the tree file */

  while ((terms = read_terms(weighted_yields_flag, yieldfp, si, &yieldweight))) {
    sentenceno++;

    /* The trace goes to tracefp, so that is the stream that must be
     * non-NULL here (as in every other trace statement below). */
    if (tracefp && debuglevel >= 10000) {
      size_t i;
      fprintf(tracefp, "\nSentence %ld:\n", sentenceno);
      for (i = 0; i < terms->n; i++)
        fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
      fprintf(tracefp, "\n");
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      inside = inside_chart(terms, g, si, wordscale);

      /* chart_display(stdout, inside, terms->n, si); */

      /* weight of the root label over the span covering the whole sentence */
      root_prob = sihashf_ref(CHART_ENTRY(inside, 0, terms->n), g->root_label);

      if (root_prob > 0.0) {
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Sum of derivation weights = %g\n", root_prob);

        sum_neglog_prob -= yieldweight*(log(root_prob)-terms->n*log(wordscale));
        *sum_yieldweights += yieldweight*terms->n;

        outside = outside_chart(g, si, inside, terms, yieldweight, rule_counts);
        /* assert(consistent_preterm_outsides(outside, terms, root_prob)); */
        /* chart_display(stdout, outside, terms->n, si); */
        chart_free(outside, terms->n);
      }
      else {
        failed_sentences++;
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Failed to parse.\n");
      }

      chart_free(inside, terms->n);  /* free the chart */
    }
    else {                      /* sentence too long */
      if (tracefp && debuglevel >= 10000)
        fprintf(tracefp, "Too long to parse.\n");
    }

    vindex_free(terms);         /* and its terms */
  }

  /* free unary closure */
  free_unary_closure(g);

  if (summaryfp && debuglevel >= 1000) {
    if (failed_sentences > 0)
      fprintf(summaryfp, " %ld sentences failed to parse", failed_sentences);
  }

  return(sum_neglog_prob);
}