static chart outside_chart(const grammar g, const si_t si, const chart inside_chart, const vindex terms, FLOAT yieldweight, FLOAT *rule_counts) { int left, right; size_t nwords = terms->n; sihashf root_inside_cell = CHART_ENTRY(inside_chart, 0, nwords); FLOAT root_prob = sihashf_ref(root_inside_cell, g->root_label); chart outside_chart = chart_make(nwords); sihashf root_outside_cell = make_sihashf(CHART_CELLS); catproblist cp; root_prob /= yieldweight; /* pretend we saw this sentence this many times */ /* install root cell */ CHART_ENTRY(outside_chart, 0, nwords) = root_outside_cell; for (cp = g->parent_childprob[g->root_label]; cp; cp = cp->next) if (sihashf_ref(root_inside_cell, cp->cat) > 0.0) sihashf_set(root_outside_cell, cp->cat, cp->prob); increment_unary_counts(g, root_inside_cell, root_outside_cell, root_prob, rule_counts); for (right=nwords; right>=1; right--) for (left=0; left<right; left++) if ((left!=0)||(right!=nwords)) /* skip root cell */ binary_outside(g, left, right, nwords, inside_chart, outside_chart, root_prob, rule_counts); /* now update counts for unary rules expanding to terminals */ for (left=0; left < nwords; left++) { rulelist rl; sihashf outside_cell = CHART_ENTRY(outside_chart, left, left+1); for (rl=g->urules[terms->e[left]]; rl; rl=rl->next) { FLOAT outside_prob = sihashf_ref(outside_cell, g->rules[rl->ruleid]->e[0]) * g->weights[rl->ruleid]; rule_counts[rl->ruleid] += outside_prob/root_prob; sihashf_inc(outside_cell, terms->e[left], outside_prob); } } return outside_chart; }
/* cky() fills and returns a chart for the words in terms using grammar g.
 *
 * Each word is first entered as an edge of weight 1.0 over its own
 * one-word span and passed to follow_unary().  The chart is then
 * completed with the left edge moving from the end of the sentence back
 * to its start.  si is not used in this function.
 */
chart cky(struct vindex terms, grammar g, si_t si)
{
  const int nwords = (int) terms.n;
  chart parse_chart = chart_make(terms.n);
  int start, split;

  /* lexical items */
  for (start = 0; start < nwords; start++) {
    const int next = start + 1;
    si_index word = terms.e[start];
    sihashcc span_cell = CHART_ENTRY(parse_chart, start, start+1);
    sihashcc start_vertex = parse_chart->vertex[start];
    chart_cell word_edge =
      add_edge(span_cell, word, NULL, NULL, 1.0, next, start_vertex);

    assert(word_edge);          /* the edge must really have been added */
    follow_unary(word_edge, span_cell, g, next, start_vertex);
  }

  /* syntactic rules */
  start = nwords - 1;
  while (start >= 0) {
    for (split = start + 1; split < nwords; split++) {
      sihashcc span_cell = CHART_ENTRY(parse_chart, start, split);

      /* unary-close the cell spanning start..split; one-word cells
       * were already handled when the lexical items were inserted */
      if (split - start > 1)
        apply_unary(span_cell, g, split, parse_chart->vertex[start]);

      /* then use it as the left child of binary rules */
      apply_binary(span_cell, start, split, parse_chart, g);
    }

    /* cells reaching the end of the sentence only need unary rules;
     * there is no need to apply binary rules to these */
    apply_unary(CHART_ENTRY(parse_chart, start, terms.n), g, nwords,
                parse_chart->vertex[start]);
    start--;
  }

  return parse_chart;
}
/* chart_display() writes every non-empty cell of chart c to fp.
 *
 * n is the number of words the chart covers.  Cells are visited by
 * increasing span width and, within one width, by increasing left edge.
 * Each non-empty cell is announced by a "cell LEFT-RIGHT" line and then
 * printed with chart_entry_display().
 */
void chart_display(FILE *fp, chart c, size_t n, si_t si)
{
  size_t width;

  for (width = 1; width <= n; width++) {
    size_t start;

    for (start = 0; start + width <= n; start++) {
      size_t end = start + width;
      sihashf entry = CHART_ENTRY(c, start, end);

      if (sihashf_size(entry) > 0) {
        fprintf(fp, "cell %d-%d\n", (int) start, (int) end);
        chart_entry_display(fp, entry, si);
      }
    }
  }
}
/* consistent_preterm_outsides() is a sanity check on an outside chart.
 *
 * For every word position i it looks up the value stored under the word
 * itself in the one-word cell (i, i+1) and compares it with root_prob.
 * Returns 0 as soon as the relative difference exceeds EPSILON, and 1 if
 * every position passes.
 */
int consistent_preterm_outsides(chart outside, vindex terms, FLOAT root_prob)
{
  size_t pos = 0;

  while (pos < terms->n) {
    double deviation =
      fabs(sihashf_ref(CHART_ENTRY(outside,pos,pos+1),terms->e[pos])-root_prob)
      / root_prob;

    if (deviation > EPSILON)
      return 0;
    pos++;
  }

  return 1;
}
chart chart_make(size_t n) { size_t i, left, right, nn = CHART_SIZE(n); chart c = MALLOC(sizeof(struct chart)); c->vertex = MALLOC((n+1)*sizeof(sihashcc)); for (i = 0; i <= n; i++) c->vertex[i] = make_sihashcc(CHART_CELLS); c->cell = MALLOC(nn*sizeof(sihashcc)); for (left = 0; left < n; left++) for (right = left+1; right <= n; right++) CHART_ENTRY(c, left, right) = make_sihashcc(left+1 == right ? NLABELS : CHART_CELLS); return c; }
/* apply_binary() uses the cell spanning left..mid as the left child of
 * binary rules.
 *
 * It walks every entry of g.brs.  When the entry's key is present in
 * left_entry, each rule stored under that key is tried against every
 * edge in vertex[mid] carrying the rule's right category, and add_edge()
 * is called for the parent over the combined span with the product of
 * the two child weights and the rule weight.
 */
static void apply_binary(sihashcc left_entry, int left, int mid, chart c, grammar g)
{
  sihashbrsit it = sihashbrsit_init(g.brs);

  while (sihashbrsit_ok(it)) {
    /* look up the rules' left category in this cell */
    chart_cell left_edge = sihashcc_ref(left_entry, it.key);

    if (left_edge) {
      size_t r = 0;

      while (r < it.value.n) {
        /* every edge starting at mid that has the rule's right category */
        chart_cell right_edge = sihashcc_ref(c->vertex[mid], it.value.e[r]->right);

        while (right_edge) {
          add_edge(CHART_ENTRY(c,left,right_edge->rightpos),
                   it.value.e[r]->parent,
                   &left_edge->tree, &right_edge->tree,
                   left_edge->prob * right_edge->prob * it.value.e[r]->prob,
                   right_edge->rightpos,
                   c->vertex[left]);
          right_edge = right_edge->next;
        }
        r++;
      }
    }

    it = sihashbrsit_next(it);
  }
}
int main(int argc, char **argv) { si_t si = make_si(1024); FILE *grammarfp = stdin, *yieldfp; FILE *tracefp = NULL; /* trace output */ FILE *summaryfp = stderr; /* end of parse stats output */ FILE *parsefp = stdout; /* parse trees */ FILE *probfp = NULL; /* max_neglog_prob */ chart_cell root_cell; grammar g; chart c; vindex terms; int maxsentlen = 0; int sentenceno = 0, parsed_sentences = 0, failed_sentences = 0; double sum_neglog_prob = 0; int sentfrom = 0; int sentto = 0; srand(RAND_SEED); /* seed random number generator */ if (argc<2 || argc>6) { fprintf(stderr, "%s yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]\n", argv[0]); exit(EXIT_FAILURE); } if ((yieldfp = fopen(argv[1], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open yieldfile %s\n", argv[0], argv[1]); exit(EXIT_FAILURE); } if (argc >= 3) if (!sscanf(argv[2], "%d", &maxsentlen)) { fprintf(stderr, "%s: Couldn't parse maxsentlen %s\n", argv[0], argv[2]); exit(EXIT_FAILURE); } if (argc >= 4) if ((grammarfp = fopen(argv[3], "r")) == NULL) { fprintf(stderr, "%s: Couldn't open grammarfile %s\n", argv[0], argv[3]); exit(EXIT_FAILURE); } if (argc >= 6) { if (!sscanf(argv[4], "%d", &sentfrom)) { fprintf(stderr, "%s: Couldn't parse sentfrom %s\n", argv[0], argv[4]); exit(EXIT_FAILURE); } if (!sscanf(argv[5], "%d", &sentto)) { fprintf(stderr, "%s: Couldn't parse sentto %s\n", argv[0], argv[5]); exit(EXIT_FAILURE); } } g = read_grammar(grammarfp, si); /* write_grammar(tracefp, g, si); */ while ((terms = read_terms(yieldfp, si))) { sentenceno++; if (sentfrom && sentenceno < sentfrom) { vindex_free(terms); continue; } if (sentto && sentenceno > sentto) { vindex_free(terms); break; } /* skip if sentence is too long */ if (!maxsentlen || (int) terms->n <= maxsentlen) { size_t i; if (tracefp) { fprintf(tracefp, "\nSentence %d:\n", sentenceno); for (i=0; i<terms->n; i++) fprintf(tracefp, " %s", si_index_string(si, terms->e[i])); fprintf(tracefp, "\n"); } c = cky(*terms, g, si); /* fetch best root node */ 
root_cell = sihashcc_ref(CHART_ENTRY(c, 0, terms->n), g.root_label); if (root_cell) { tree parse_tree = bintree_tree(&root_cell->tree, si); double prob = (double) root_cell->prob; parsed_sentences++; assert(prob > 0.0); sum_neglog_prob -= log(prob); if (probfp) fprintf(probfp, "max_neglog_prob(%d, %g).\n", sentenceno, -log(prob)); if (tracefp) fprintf(tracefp, " Prob = %g\n", prob); if (parsefp) { write_tree(parsefp, parse_tree, si); fprintf(parsefp, "\n"); /* write_prolog_tree(parsefp, parse_tree, si); */ } free_tree(parse_tree); } else { failed_sentences++; if (tracefp) fprintf(tracefp, "Failed to parse\n"); if (parsefp) fprintf(parsefp, "parse_failure.\n"); } chart_free(c, terms->n); /* free the chart */ } else { /* sentence too long */ if (parsefp) fprintf(parsefp, "too_long.\n"); } vindex_free(terms); /* free the terms */ assert(trees_allocated == 0); assert(bintrees_allocated == 0); } free_grammar(g); si_free(si); if (summaryfp) { fprintf(summaryfp, "\n%d/%d = %g%% test sentences met the length criteron," " of which %d/%d = %g%% were parsed\n", parsed_sentences+failed_sentences, sentenceno, (double) (100.0 * (parsed_sentences+failed_sentences)) / sentenceno, parsed_sentences, parsed_sentences+failed_sentences, (double) (100.0 * parsed_sentences) / (parsed_sentences + failed_sentences)); fprintf(summaryfp, "Sum(-log prob) = %g\n", sum_neglog_prob); } /* check that everything has been deallocated */ /* printf("mmm_blocks_allocated = %ld\n", (long) mmm_blocks_allocated); */ assert(mmm_blocks_allocated == 0); exit(EXIT_SUCCESS); }
/* expected_rule_counts() makes one pass over the sentences in yieldfp and
 * accumulates rule counts from an inside chart and an outside chart per
 * sentence.
 *
 *  g                    the grammar; its unary closure is computed on
 *                       entry and freed again before returning
 *  si                   string index used to print words and passed on to
 *                       the chart routines
 *  yieldfp              sentences to process; rewound before reading
 *  tracefp              per-sentence trace output (may be NULL); only
 *                       written when debuglevel >= 10000
 *  summaryfp            end-of-pass summary (may be NULL); only written
 *                       when debuglevel >= 1000
 *  debuglevel           verbosity threshold, see tracefp and summaryfp
 *  maxsentlen           sentences longer than this are skipped (0 = no limit)
 *  minruleprob          passed to compute_unary_closure()
 *  wordscale            passed to inside_chart(); its log per word is
 *                       removed again from the returned sum
 *  rule_counts          array of g->nrules entries; zeroed, then incremented
 *  sum_yieldweights     set to the sum of yieldweight * sentence length
 *                       over the sentences that parsed
 *  weighted_yields_flag passed to read_terms()
 *
 * Returns the sum over parsed sentences of
 *   -yieldweight * (log(root weight) - nwords * log(wordscale)).
 */
FLOAT expected_rule_counts(const grammar g, const si_t si, FILE *yieldfp,
                           FILE *tracefp, FILE *summaryfp, int debuglevel,
                           int maxsentlen, FLOAT minruleprob, FLOAT wordscale,
                           FLOAT *rule_counts, FLOAT *sum_yieldweights,
                           int weighted_yields_flag)
{
  vindex terms;
  FLOAT root_prob;
  chart inside, outside;
  long sentenceno = 0, parsed_sentences = 0, failed_sentences = 0;
  double sum_neglog_prob = 0.0;
  float yieldweight;

  *sum_yieldweights = 0;

  /* FLOAT *rule_counts = CALLOC(g->nrules, sizeof(FLOAT)); */
  { size_t i;                           /* zero rule counts */
    for (i=0; i<g->nrules; i++)
      rule_counts[i] = 0.0;
  }

  compute_unary_closure(g, minruleprob);  /* compute unary_close */

  rewind(yieldfp);                      /* rewind the tree file */

  while ((terms = read_terms(weighted_yields_flag, yieldfp, si, &yieldweight))) {
    sentenceno++;

    /* this block writes to tracefp, so tracefp is the stream that has to
     * be checked for NULL */
    if (tracefp && debuglevel >= 10000) {
      size_t i;
      fprintf(tracefp, "\nSentence %ld:\n", sentenceno);
      for (i=0; i<terms->n; i++)
        fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
      fprintf(tracefp, "\n");
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      inside = inside_chart(terms, g, si, wordscale);

      /* chart_display(stdout, inside, terms->n, si); */

      root_prob = sihashf_ref(CHART_ENTRY(inside, 0, terms->n), g->root_label);

      if (root_prob > 0.0) {
        if (tracefp && debuglevel >= 10000)
          /* the cast keeps %g valid whatever floating type FLOAT is */
          fprintf(tracefp, "Sum of derivation weights = %g\n",
                  (double) root_prob);
        sum_neglog_prob -= yieldweight*(log(root_prob)-terms->n*log(wordscale));
        *sum_yieldweights += yieldweight*terms->n;
        parsed_sentences++;

        outside = outside_chart(g, si, inside, terms, yieldweight, rule_counts);
        /* assert(consistent_preterm_outsides(outside, terms, root_prob)); */
        /* chart_display(stdout, outside, terms->n, si); */
        chart_free(outside, terms->n);
      }
      else {
        failed_sentences++;
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Failed to parse.\n");
      }

      chart_free(inside, terms->n);     /* free the chart */
    }
    else {                              /* sentence too long */
      if (tracefp && debuglevel >= 10000)
        fprintf(tracefp, "Too long to parse.\n");
    }

    vindex_free(terms);                 /* and its terms */
  }

  /* free unary closure */
  free_unary_closure(g);

  if (summaryfp && debuglevel >= 1000) {
    if (failed_sentences>0)
      fprintf(summaryfp, " %ld sentences failed to parse",
              (long) failed_sentences);
  }

  return(sum_neglog_prob);
}
/* binary_outside() computes the outside cell for the span
 * (left_pos, right_pos) and installs it in outside_chart.
 *
 *  g             the grammar
 *  left_pos      left edge of the span being filled
 *  right_pos     right edge of the span being filled
 *  nwords        sentence length; the largest right edge considered
 *  inside_chart  inside chart for the sentence (only read)
 *  outside_chart outside chart under construction; cells of wider spans
 *                sharing an edge with this one are read, so the caller
 *                has to fill those first
 *  root_prob     divisor applied to every rule count increment
 *  rule_counts   indexed by rule id; incremented in place
 *
 * A fresh outside cell is always allocated and stored, even when the
 * inside cell is empty.  Binary rule counts are incremented only in the
 * first pass below, where this span is the right child; the second
 * pass, where it is the left child, deliberately leaves rule_counts
 * alone so that no rule is counted twice.
 *
 * NOTE(review): g->brules[] appears to be indexed by a rule's right
 * child category, with bp->left the left child - confirm against the
 * grammar definition.
 */
static void binary_outside(const grammar g, size_t left_pos, size_t right_pos,
                           size_t nwords, const chart inside_chart,
                           chart outside_chart, FLOAT root_prob,
                           FLOAT *rule_counts)
{
  si_index right_cat;
  FLOAT *completes;   /* per-category outside weight before unary closure */
  sihashf child_outside = make_sihashf(CHART_CELLS);
  sihashf child_inside = CHART_ENTRY(inside_chart, left_pos, right_pos);

  CHART_ENTRY(outside_chart, left_pos, right_pos) = child_outside;

  /* if the inside chart cell is empty, then there's no point calculating
   * the outside cell */
  if (sihashf_size(child_inside) == 0)
    return;

  /* index 0 is never used; categories 1..g->nnts are zeroed */
  completes = MALLOC((g->nnts+1)*sizeof(FLOAT));
  { size_t child_cat;
    for (child_cat=1; child_cat<=g->nnts; child_cat++)
      completes[child_cat] = 0.0;
  }

  /* try to combine with cells on left: this span is the right child and
   * a sibling spanning far_left_pos..left_pos is the left child */
  for (right_cat=1; right_cat<=g->nnts; right_cat++) {
    brule bp = g->brules[right_cat];
    FLOAT right_inside_weight = sihashf_ref(child_inside, right_cat);
    /* rules and inside category? */
    if (bp&&(right_inside_weight>0.0)) {
      for ( ; bp; bp=bp->next) {
        size_t far_left_pos;
        for (far_left_pos=0; far_left_pos<left_pos; far_left_pos++) {
          FLOAT far_left_inside_weight =
            sihashf_ref(CHART_ENTRY(inside_chart, far_left_pos, left_pos),
                        bp->left);
          if (far_left_inside_weight>0.0) {
            rulelist rp;
            /* parent is bp->active_parent: no rule weight is applied
             * and no rule count is incremented for it */
            if (bp->active_parent) {
              FLOAT parent_outside_weight =
                sihashf_ref(CHART_ENTRY(outside_chart,far_left_pos,right_pos),
                            bp->active_parent);
              if (parent_outside_weight>0.0)
                completes[right_cat] += parent_outside_weight*
                  far_left_inside_weight;
            }
            /* parents reached through the rules listed in bp->completes */
            for (rp=bp->completes; rp; rp=rp->next) {
              FLOAT parent_outside_weight =
                sihashf_ref(CHART_ENTRY(outside_chart,far_left_pos,right_pos),
                            g->rules[rp->ruleid]->e[0]);
              if (parent_outside_weight>0.0) {
                FLOAT parent_left_rule_weight = parent_outside_weight*
                  far_left_inside_weight* g->weights[rp->ruleid];
                completes[right_cat] += parent_left_rule_weight ;
                /* the only place a binary rule's count is incremented */
                rule_counts[rp->ruleid] += parent_left_rule_weight*
                  right_inside_weight/root_prob;
              }}}}}}}

  /* try to combine with cells on right: this span is the left child and
   * a sibling spanning right_pos..far_right_pos is the right child */
  for (right_cat=1; right_cat<=g->nnts; right_cat++) {
    brule bp;
    size_t far_right_pos;
    for (bp=g->brules[right_cat]; bp; bp=bp->next)
      for (far_right_pos=right_pos+1; far_right_pos<=nwords; far_right_pos++) {
        FLOAT far_right_inside_weight =
          sihashf_ref(CHART_ENTRY(inside_chart, right_pos, far_right_pos),
                      right_cat);
        si_index child_cat=bp->left;
        FLOAT child_inside_weight = sihashf_ref(child_inside, child_cat);
        if ((far_right_inside_weight>0.0)&&(child_inside_weight>0.0)) {
          rulelist rp;
          if (bp->active_parent) {
            FLOAT parent_outside_weight =
              sihashf_ref(CHART_ENTRY(outside_chart, left_pos, far_right_pos),
                          bp->active_parent);
            if (parent_outside_weight>0.0) {
              /* categories up to g->nnts go through completes[] and so
               * through the unary closure below; any other category is
               * written straight into the outside cell */
              if (child_cat<=g->nnts)
                completes[child_cat] += parent_outside_weight*
                  far_right_inside_weight;
              else {
                assert(child_cat>g->ncats); /* otherwise child_cat is a term */
                sihashf_inc(child_outside, child_cat,
                            parent_outside_weight*far_right_inside_weight);
              }
            }}
          for (rp=bp->completes; rp; rp=rp->next) {
            FLOAT parent_outside_weight =
              sihashf_ref(CHART_ENTRY(outside_chart, left_pos, far_right_pos),
                          g->rules[rp->ruleid]->e[0]);
            if (parent_outside_weight>0.0) {
              FLOAT parent_right_rule_weight = parent_outside_weight*
                far_right_inside_weight* g->weights[rp->ruleid];
              if (child_cat<=g->nnts)
                completes[child_cat] += parent_right_rule_weight;
              else {
                assert(child_cat>g->ncats); /* otherwise child_cat is a term */
                sihashf_inc(child_outside,child_cat,parent_right_rule_weight);
              }
              /* don't double count the rule!
               * it's been counted before from the left */
              /*
                rule_counts[rp->ruleid] += parent_right_rule_weight*
                  child_inside_weight/root_prob;
              */
            }}}}}

  /* unary closure: spread each accumulated weight over the categories
   * listed in g->parent_childprob[] that occur in the inside cell */
  /* unary closure for root cell done in outside_chart() */
  { si_index parent_cat;
    for (parent_cat=1; parent_cat<=g->nnts; parent_cat++) {
      FLOAT parent_outside_weight = completes[parent_cat];
      if (parent_outside_weight>0.0) {
        catproblist cp;
        for (cp = g->parent_childprob[parent_cat]; cp; cp = cp->next)
          if (sihashf_ref(child_inside, cp->cat) > 0.0)
            sihashf_inc(child_outside, cp->cat,
                        cp->prob*parent_outside_weight);
      }}}

  /* increment unary rule_counts */
  /* rule counts for root cell done in outside_chart() */
  increment_unary_counts(g, child_inside, child_outside, root_prob,
                         rule_counts);

  FREE(completes);
}
static chart inside_chart(vindex terms, grammar g, si_t si, FLOAT wordscale) { int left, right, mid; chart c = chart_make(terms->n); /* parent_completes is a sihashf of completed categories (i.e., not the * new, active categories produced by binarization). Unary closure * applies to these categories. * The pre-unary-closure parent weights are stored in parent_completes * before unary closure is applied to them */ /* Inside pass */ /* insert lexical items */ for (left=0; left< (int) terms->n; left++) { rulelist rl; si_index terminal = terms->e[left]; sihashf chart_entry = make_sihashf(NLABELS); CHART_ENTRY(c, left, left+1) = chart_entry; assert(terminal>0); if (terminal<=g->nnts) { fprintf(stderr, "Error in inside_chart() in expected-counts.c: " "input contains nonterminal symbol %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } if (terminal>g->ncats) { fprintf(stderr, "Error in inside_chart() in expected-counts.c:" " input contains unknown terminal %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } /* no need to actually enter terminal into chart */ /* sihashf_set(chart_entry, terminal, 1.0); */ rl = g->urules[terminal]; assert(rl); /* check there are rules for this terminal */ for ( ; rl; rl=rl->next) { si_index preterminal = g->rules[rl->ruleid]->e[0]; FLOAT preterminal_prob = g->weights[rl->ruleid]*wordscale; catproblist pp; /* assert(rl->ruleid<g->nrules); */ assert(g->child_parentprob[preterminal]); for (pp = g->child_parentprob[preterminal]; pp; pp = pp->next) sihashf_inc(chart_entry, pp->cat, preterminal_prob*pp->prob); } /* fprintf(stderr, "Chart entry %d-%d\n", (int) left, (int) left+1); chart_entry_display(stderr, chart_entry, si); */ } for (right=2; right<= (int) terms->n; right++) for (left=right-2; left>=0; left--) { sihashf parent_completes = make_sihashf(COMPLETE_CELLS); sihashf chart_entry = make_sihashf(CHART_CELLS); CHART_ENTRY(c, left, right) = chart_entry; for (mid=left+1; mid<right; mid++) binary_inside(g, chart_entry, 
parent_completes, CHART_ENTRY(c,left,mid), CHART_ENTRY(c,mid,right)); unary_closure_inside(g, chart_entry, parent_completes); free_sihashf(parent_completes); /* fprintf(stdout, "Chart entry %d-%d\n", (int) left, (int) right); */ /* chart_entry_display(stdout, CHART_ENTRY(inside,left,right), si); */ } return c; }