/* Print the local trees stored in localtree_ht to stdout, one per line, in
 * the form
 *
 *     <count> TAB <parent> REWRITES <child> <child> ...
 *
 * The table is swept twice: the first sweep prints only the entries whose
 * first element (the parent) equals root, the second sweep prints all the
 * remaining entries.  Expansions of root therefore come first in the output.
 *
 * Each key of the table is treated as a vindex whose element 0 is the parent
 * and whose elements 1..n-1 are the children; the associated value is the
 * count.  Labels are obtained from si via si_index_string().
 */
void write_local_trees(const vihashl localtree_ht, si_t si, si_index root)
{
  int sweep;

  for (sweep = 0; sweep < 2; sweep++) {
    vihashlit it = vihashlit_init(localtree_ht);

    while (vihashlit_ok(it)) {
      vindex rule = (vindex) it.key;
      int parent_is_root;

      assert(rule->n > 0);
      assert(rule->n <= MAXRHS);

      /* sweep 0 emits root expansions, sweep 1 emits everything else */
      parent_is_root = (rule->e[0] == root);
      if ((sweep == 0) == parent_is_root) {
        long count = it.value;
        char *label = si_index_string(si, rule->e[0]);
        size_t k;

        assert(label);
        printf("%ld\t%s " REWRITES, count, label);

        for (k = 1; k < rule->n; k++) {
          label = si_index_string(si, rule->e[k]);
          assert(label);
          printf(" %s", label);
        }
        printf("\n");
      }

      it = vihashlit_next(it);
    }
  }
}
/* Debugging aid: write every (key, value) pair of chart_entry to fp, one
 * pair per line, formatted as " <label>: <value>".  The label is looked up
 * in si with si_index_string().
 */
static void chart_entry_display(FILE *fp, sihashf chart_entry, si_t si)
{
  sihashfit cell = sihashfit_init(chart_entry);

  while (sihashfit_ok(cell)) {
    /* the stored value is narrowed to float before it is printed */
    float weight = (float) cell.value;

    fprintf(fp, " %s: %g\n", si_index_string(si, cell.key), weight);
    cell = sihashfit_next(cell);
  }
}
/* Write the rules of grammar g to fp, one rule per line.
 *
 * All binary rules (from g.brs) are written first, as
 *     <prob> <parent> REWRITES <left> <right>
 * followed by all unary rules (from g.urs), as
 *     <prob> <parent> REWRITES <child>
 *
 * Category labels are looked up in si with si_index_string().
 */
void write_grammar(FILE *fp, grammar g, si_t si)
{
  sihashbrsit bin_it;
  sihashursit un_it;
  size_t k;

  /* binary rules */
  bin_it = sihashbrsit_init(g.brs);
  while (sihashbrsit_ok(bin_it)) {
    for (k = 0; k < bin_it.value.n; k++) {
      double p = (double) bin_it.value.e[k]->prob;
      const char *parent = si_index_string(si, bin_it.value.e[k]->parent);
      const char *left = si_index_string(si, bin_it.value.e[k]->left);
      const char *right = si_index_string(si, bin_it.value.e[k]->right);

      fprintf(fp, "%g %s " REWRITES " %s %s\n", p, parent, left, right);
    }
    bin_it = sihashbrsit_next(bin_it);
  }

  /* unary rules */
  un_it = sihashursit_init(g.urs);
  while (sihashursit_ok(un_it)) {
    for (k = 0; k < un_it.value.n; k++) {
      double p = (double) un_it.value.e[k]->prob;
      const char *parent = si_index_string(si, un_it.value.e[k]->parent);
      const char *child = si_index_string(si, un_it.value.e[k]->child);

      fprintf(fp, "%g %s " REWRITES " %s\n", p, parent, child);
    }
    un_it = sihashursit_next(un_it);
  }
}
int main(int argc, char **argv) { si_t si = make_si(1024); FILE *grammarfp = stdin, *yieldfp; FILE *tracefp = NULL; /* trace output */ FILE *summaryfp = stderr; /* end of parse stats output */ FILE *parsefp = stdout; /* parse trees */ FILE *probfp = NULL; /* max_neglog_prob */ chart_cell root_cell; grammar g; chart c; vindex terms; int maxsentlen = 0; int sentenceno = 0, parsed_sentences = 0, failed_sentences = 0; double sum_neglog_prob = 0; int sentfrom = 0; int sentto = 0; srand(RAND_SEED); /* seed random number generator */ if (argc<2 || argc>6) { fprintf(stderr, "%s yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]\n", argv[0]); exit(EXIT_FAILURE); } if ((yieldfp = fopen(argv[1], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open yieldfile %s\n", argv[0], argv[1]); exit(EXIT_FAILURE); } if (argc >= 3) if (!sscanf(argv[2], "%d", &maxsentlen)) { fprintf(stderr, "%s: Couldn't parse maxsentlen %s\n", argv[0], argv[2]); exit(EXIT_FAILURE); } if (argc >= 4) if ((grammarfp = fopen(argv[3], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open grammarfile %s\n", argv[0], argv[3]); exit(EXIT_FAILURE); } if (argc >= 6) { if (!sscanf(argv[4], "%d", &sentfrom)) { fprintf(stderr, "%s: Couldn't parse sentfrom %s\n", argv[0], argv[4]); exit(EXIT_FAILURE); } if (!sscanf(argv[5], "%d", &sentto)) { fprintf(stderr, "%s: Couldn't parse sentto %s\n", argv[0], argv[5]); exit(EXIT_FAILURE); } } g = read_grammar(grammarfp, si); /* write_grammar(tracefp, g, si); */ while ((terms = read_terms(yieldfp, si))) { sentenceno++; if (sentfrom && sentenceno < sentfrom) { vindex_free(terms); continue; } if (sentto && sentenceno > sentto) { vindex_free(terms); break; } /* skip if sentence is too long */ if (!maxsentlen || (int) terms->n <= maxsentlen) { size_t i; if (tracefp) { fprintf(tracefp, "\nSentence %d:\n", sentenceno); for (i=0; i<terms->n; i++) fprintf(tracefp, " %s", si_index_string(si, terms->e[i])); fprintf(tracefp, "\n"); } c = cky(*terms, g, si); /* fetch best root node */ 
root_cell = sihashcc_ref(CHART_ENTRY(c, 0, terms->n), g.root_label); if (root_cell) { tree parse_tree = bintree_tree(&root_cell->tree, si); double prob = (double) root_cell->prob; parsed_sentences++; assert(prob > 0.0); sum_neglog_prob -= log(prob); if (probfp) fprintf(probfp, "max_neglog_prob(%d, %g).\n", sentenceno, -log(prob)); if (tracefp) fprintf(tracefp, " Prob = %g\n", prob); if (parsefp) { write_tree(parsefp, parse_tree, si); fprintf(parsefp, "\n"); /* write_prolog_tree(parsefp, parse_tree, si); */ } free_tree(parse_tree); } else { failed_sentences++; if (tracefp) fprintf(tracefp, "Failed to parse\n"); if (parsefp) fprintf(parsefp, "parse_failure.\n"); } chart_free(c, terms->n); /* free the chart */ } else { /* sentence too long */ if (parsefp) fprintf(parsefp, "too_long.\n"); } vindex_free(terms); /* free the terms */ assert(trees_allocated == 0); assert(bintrees_allocated == 0); } free_grammar(g); si_free(si); if (summaryfp) { fprintf(summaryfp, "\n%d/%d = %g%% test sentences met the length criteron," " of which %d/%d = %g%% were parsed\n", parsed_sentences+failed_sentences, sentenceno, (double) (100.0 * (parsed_sentences+failed_sentences)) / sentenceno, parsed_sentences, parsed_sentences+failed_sentences, (double) (100.0 * parsed_sentences) / (parsed_sentences + failed_sentences)); fprintf(summaryfp, "Sum(-log prob) = %g\n", sum_neglog_prob); } /* check that everything has been deallocated */ /* printf("mmm_blocks_allocated = %ld\n", (long) mmm_blocks_allocated); */ assert(mmm_blocks_allocated == 0); exit(EXIT_SUCCESS); }
/* Run one pass over the sentences in yieldfp, building an inside chart for
 * each sentence and, when the sentence has a root analysis, an outside chart.
 *
 * Parameters:
 *   g, si                 grammar and symbol table
 *   yieldfp               sentence file; it is rewound before reading
 *   tracefp               per-sentence trace stream, written only when it is
 *                         non-NULL and debuglevel >= 10000
 *   summaryfp             summary stream, written only when it is non-NULL
 *                         and debuglevel >= 1000
 *   maxsentlen            sentences with more terminals are skipped;
 *                         0 means no limit
 *   minruleprob           passed to compute_unary_closure()
 *   wordscale             passed to inside_chart(); its log, times the
 *                         sentence length, is subtracted again from the
 *                         log root probability
 *   rule_counts           array of g->nrules entries; zeroed here and then
 *                         handed to outside_chart() (which presumably adds
 *                         the expected counts into it -- confirm there)
 *   sum_yieldweights      out: sum over parsed sentences of
 *                         yieldweight * sentence length
 *   weighted_yields_flag  passed through to read_terms()
 *
 * Returns the sum over parsed sentences of
 *   -yieldweight * (log(root_prob) - n * log(wordscale)).
 */
FLOAT expected_rule_counts(const grammar g, const si_t si, FILE *yieldfp, FILE *tracefp, FILE *summaryfp, int debuglevel, int maxsentlen, FLOAT minruleprob, FLOAT wordscale, FLOAT *rule_counts, FLOAT *sum_yieldweights, int weighted_yields_flag)
{
  vindex terms;
  FLOAT root_prob;
  chart inside, outside;
  long sentenceno = 0, parsed_sentences = 0, failed_sentences = 0;
  double sum_neglog_prob = 0.0;
  float yieldweight;

  *sum_yieldweights = 0;

  /* FLOAT *rule_counts = CALLOC(g->nrules, sizeof(FLOAT)); */

  {
    size_t i;                           /* zero rule counts */
    for (i = 0; i < g->nrules; i++)
      rule_counts[i] = 0.0;
  }

  compute_unary_closure(g, minruleprob);        /* compute unary_close */

  rewind(yieldfp);                              /* rewind the tree file */

  while ((terms = read_terms(weighted_yields_flag, yieldfp, si, &yieldweight))) {
    sentenceno++;

    /* The guard must test tracefp, the stream actually written to; testing
     * summaryfp here would dereference a NULL tracefp whenever only the
     * summary stream is open. */
    if (tracefp && debuglevel >= 10000) {
      size_t i;
      fprintf(tracefp, "\nSentence %ld:\n", sentenceno);
      for (i = 0; i < terms->n; i++)
        fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
      fprintf(tracefp, "\n");
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      inside = inside_chart(terms, g, si, wordscale);

      /* chart_display(stdout, inside, terms->n, si); */

      root_prob = sihashf_ref(CHART_ENTRY(inside, 0, terms->n), g->root_label);

      if (root_prob > 0.0) {
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Sum of derivation weights = %g\n", root_prob);

        /* undo the per-word scaling applied inside inside_chart() */
        sum_neglog_prob -= yieldweight*(log(root_prob)-terms->n*log(wordscale));
        *sum_yieldweights += yieldweight*terms->n;
        parsed_sentences++;

        outside = outside_chart(g, si, inside, terms, yieldweight, rule_counts);

        /* assert(consistent_preterm_outsides(outside, terms, root_prob)); */
        /* chart_display(stdout, outside, terms->n, si); */

        chart_free(outside, terms->n);
      }
      else {
        failed_sentences++;
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Failed to parse.\n");
      }

      chart_free(inside, terms->n);             /* free the chart */
    }
    else {                                      /* sentence too long */
      if (tracefp && debuglevel >= 10000)
        fprintf(tracefp, "Too long to parse.\n");
    }

    vindex_free(terms);                         /* and its terms */
  }

  /* free unary closure */
  free_unary_closure(g);

  if (summaryfp && debuglevel >= 1000) {
    if (failed_sentences > 0)
      fprintf(summaryfp, " %ld sentences failed to parse", (long) failed_sentences);
  }

  return(sum_neglog_prob);
}
static chart inside_chart(vindex terms, grammar g, si_t si, FLOAT wordscale) { int left, right, mid; chart c = chart_make(terms->n); /* parent_completes is a sihashf of completed categories (i.e., not the * new, active categories produced by binarization). Unary closure * applies to these categories. * The pre-unary-closure parent weights are stored in parent_completes * before unary closure is applied to them */ /* Inside pass */ /* insert lexical items */ for (left=0; left< (int) terms->n; left++) { rulelist rl; si_index terminal = terms->e[left]; sihashf chart_entry = make_sihashf(NLABELS); CHART_ENTRY(c, left, left+1) = chart_entry; assert(terminal>0); if (terminal<=g->nnts) { fprintf(stderr, "Error in inside_chart() in expected-counts.c: " "input contains nonterminal symbol %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } if (terminal>g->ncats) { fprintf(stderr, "Error in inside_chart() in expected-counts.c:" " input contains unknown terminal %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } /* no need to actually enter terminal into chart */ /* sihashf_set(chart_entry, terminal, 1.0); */ rl = g->urules[terminal]; assert(rl); /* check there are rules for this terminal */ for ( ; rl; rl=rl->next) { si_index preterminal = g->rules[rl->ruleid]->e[0]; FLOAT preterminal_prob = g->weights[rl->ruleid]*wordscale; catproblist pp; /* assert(rl->ruleid<g->nrules); */ assert(g->child_parentprob[preterminal]); for (pp = g->child_parentprob[preterminal]; pp; pp = pp->next) sihashf_inc(chart_entry, pp->cat, preterminal_prob*pp->prob); } /* fprintf(stderr, "Chart entry %d-%d\n", (int) left, (int) left+1); chart_entry_display(stderr, chart_entry, si); */ } for (right=2; right<= (int) terms->n; right++) for (left=right-2; left>=0; left--) { sihashf parent_completes = make_sihashf(COMPLETE_CELLS); sihashf chart_entry = make_sihashf(CHART_CELLS); CHART_ENTRY(c, left, right) = chart_entry; for (mid=left+1; mid<right; mid++) binary_inside(g, chart_entry, 
parent_completes, CHART_ENTRY(c,left,mid), CHART_ENTRY(c,mid,right)); unary_closure_inside(g, chart_entry, parent_completes); free_sihashf(parent_completes); /* fprintf(stdout, "Chart entry %d-%d\n", (int) left, (int) right); */ /* chart_entry_display(stdout, CHART_ENTRY(inside,left,right), si); */ } return c; }
grammar read_grammar(FILE *fp, si_t si) { sihashbrs left_brules_ht = make_sihashbrs(NLABELS); sihashurs child_urules_ht = make_sihashurs(NLABELS); sihashf parent_weight_ht = make_sihashf(NLABELS); brihashbr brihtbr = make_brihashbr(NLABELS); int n; double weight; urule ur; sihashbrsit bhit; sihashursit uhit; size_t root_label = 0, lhs, cat, rhs[MAXRHS]; while ((n = fscanf(fp, " %lg ", &weight)) == 1) { /* read the count */ lhs = read_cat(fp, si); assert(weight > 0); assert(lhs); if (!root_label) root_label = lhs; fscanf(fp, " " REWRITES); /* read the rewrites symbol */ for (n=0; n<MAXRHS; n++) { /* read the rhs, n is length of rhs */ cat = read_cat(fp, si); if (!cat) break; rhs[n] = cat; } if (n >= MAXRHS) { fprintf(stderr, "read_grammar() in grammar.c: rule rhs too long\n"); exit(EXIT_FAILURE); } switch (n) { case 0: fprintf(stderr, "read_grammar() in grammar.c: rule with empty rhs\n"); exit(EXIT_FAILURE); break; case 1: ur = make_urule(weight, lhs, rhs[0]); push_urule(child_urules_ht, ur->child, ur); sihashf_inc(parent_weight_ht, ur->parent, weight); break; case 2: add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], rhs[1]); sihashf_inc(parent_weight_ht, lhs, weight); break; default: { int start, i, j; char bcat[MAXBLABELLEN], *s; si_index bparent, left, right; right = rhs[n-1]; /* rightmost category */ for (start=n-2; start>=1; start--) { i = 0; /* i is index into bcat[] */ for (j=start; j<n; j++) { /* j is index into rhs[] */ if (j!=start) { bcat[i++] = BINSEP; assert(i < MAXBLABELLEN); } s = si_index_string(si, rhs[j]); while (*s) { bcat[i++] = *s++; assert(i < MAXBLABELLEN); }} bcat[i] = '\0'; bparent = si_string_index(si, bcat); left = rhs[start]; add_brule(left_brules_ht, brihtbr, weight, bparent, left, right); sihashf_inc(parent_weight_ht, bparent, weight); right = bparent; } add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], right); sihashf_inc(parent_weight_ht, lhs, weight); }}} free_brihashbr(brihtbr); /* free brindex hash table */ { int i; /* 
normalize grammar rules */ for (bhit = sihashbrsit_init(left_brules_ht); sihashbrsit_ok(bhit); bhit = sihashbrsit_next(bhit)) for (i=0; i<bhit.value.n; i++) bhit.value.e[i]->prob /= sihashf_ref(parent_weight_ht, bhit.value.e[i]->parent); for (uhit = sihashursit_init(child_urules_ht); sihashursit_ok(uhit); uhit = sihashursit_next(uhit)) for (i=0; i<uhit.value.n; i++) uhit.value.e[i]->prob /= sihashf_ref(parent_weight_ht, uhit.value.e[i]->parent); } free_sihashf(parent_weight_ht); { grammar g; g.urs = child_urules_ht; g.brs = left_brules_ht; g.root_label = root_label; return g; } }