/* Combine one (left cell, right cell) pair of inside-chart cells into the
 * parent cell, using the binary rules of g.
 *
 * g->brules[] is scanned by right-child category (1..g->nnts).  For every
 * binary rule whose right category has positive weight in right_entry and
 * whose left category has positive weight in left_entry:
 *   - if the rule has an active parent, that category is incremented in
 *     parent_entry by  left weight * right weight;
 *   - for every entry on the rule's completes list, the parent category of
 *     that rule is incremented in parent_completes by
 *     left weight * right weight * g->weights[ruleid].
 */
static void
binary_inside(const grammar g, sihashf parent_entry, sihashf parent_completes,
              sihashf left_entry, sihashf right_entry)
{
  si_index rcat;

  /* an empty daughter cell cannot contribute anything to the parent */
  if (sihashf_size(left_entry) == 0)
    return;
  if (sihashf_size(right_entry) == 0)
    return;

  for (rcat = 1; rcat <= g->nnts; rcat++) {
    brule br = g->brules[rcat];
    FLOAT cr;

    if (!br)
      continue;                 /* no binary rules with this right category */

    cr = sihashf_ref(right_entry, rcat);
    if (!(cr > 0.0))
      continue;                 /* category absent from the right cell */

    for ( ; br; br = br->next) {
      FLOAT cl = sihashf_ref(left_entry, br->left);
      rulelist done;

      if (!(cl > 0.0))
        continue;               /* category absent from the left cell */

      /* active categories go straight into the chart cell */
      if (br->active_parent)
        sihashf_inc(parent_entry, br->active_parent, cl*cr);

      /* completed categories are collected for later unary closure */
      for (done = br->completes; done; done = done->next) {
        si_index parent_cat = g->rules[done->ruleid]->e[0];
        assert(parent_cat <= g->nnts);
        sihashf_inc(parent_completes, parent_cat,
                    cl*cr*g->weights[done->ruleid]);
      }
    }
  }
}
/* Unary closure for one inside-chart cell.
 *
 * For every (category, weight) pair stored in parent_completes, walk the
 * list g->child_parentprob[category] and increment each listed category in
 * parent_entry by  weight * list entry's prob.
 */
static void
unary_closure_inside(const grammar g, sihashf parent_entry,
                     sihashf parent_completes)
{
  sihashfit child_it = sihashfit_init(parent_completes);

  while (sihashfit_ok(child_it)) {
    catproblist up = g->child_parentprob[child_it.key];

    while (up) {
      sihashf_inc(parent_entry, up->cat, child_it.value*up->prob);
      up = up->next;
    }
    child_it = sihashfit_next(child_it);
  }
}
static chart outside_chart(const grammar g, const si_t si, const chart inside_chart, const vindex terms, FLOAT yieldweight, FLOAT *rule_counts) { int left, right; size_t nwords = terms->n; sihashf root_inside_cell = CHART_ENTRY(inside_chart, 0, nwords); FLOAT root_prob = sihashf_ref(root_inside_cell, g->root_label); chart outside_chart = chart_make(nwords); sihashf root_outside_cell = make_sihashf(CHART_CELLS); catproblist cp; root_prob /= yieldweight; /* pretend we saw this sentence this many times */ /* install root cell */ CHART_ENTRY(outside_chart, 0, nwords) = root_outside_cell; for (cp = g->parent_childprob[g->root_label]; cp; cp = cp->next) if (sihashf_ref(root_inside_cell, cp->cat) > 0.0) sihashf_set(root_outside_cell, cp->cat, cp->prob); increment_unary_counts(g, root_inside_cell, root_outside_cell, root_prob, rule_counts); for (right=nwords; right>=1; right--) for (left=0; left<right; left++) if ((left!=0)||(right!=nwords)) /* skip root cell */ binary_outside(g, left, right, nwords, inside_chart, outside_chart, root_prob, rule_counts); /* now update counts for unary rules expanding to terminals */ for (left=0; left < nwords; left++) { rulelist rl; sihashf outside_cell = CHART_ENTRY(outside_chart, left, left+1); for (rl=g->urules[terms->e[left]]; rl; rl=rl->next) { FLOAT outside_prob = sihashf_ref(outside_cell, g->rules[rl->ruleid]->e[0]) * g->weights[rl->ruleid]; rule_counts[rl->ruleid] += outside_prob/root_prob; sihashf_inc(outside_cell, terms->e[left], outside_prob); } } return outside_chart; }
/* Compute the outside cell for the span (left_pos, right_pos) and
 * accumulate rule counts.
 *
 * A fresh cell is created and installed in outside_chart.  If the matching
 * inside cell is empty nothing else is done.  Otherwise:
 *
 *  1. This cell is treated as the RIGHT child of a binary rule whose left
 *     child spans (far_left, left_pos) and whose parent spans
 *     (far_left, right_pos), for every far_left < left_pos.
 *  2. This cell is treated as the LEFT child of a binary rule whose right
 *     child spans (right_pos, far_right) and whose parent spans
 *     (left_pos, far_right), for every right_pos < far_right <= nwords.
 *  3. The weights collected per category (indices 1..g->nnts) are pushed
 *     through g->parent_childprob into the outside cell, for categories
 *     that have positive inside weight in this span.
 *  4. increment_unary_counts() is called for this cell.
 *
 * The outside cells of all parent spans read in steps 1 and 2 must already
 * be present in outside_chart.
 */
static void
binary_outside(const grammar g, size_t left_pos, size_t right_pos,
               size_t nwords, const chart inside_chart, chart outside_chart,
               FLOAT root_prob, FLOAT *rule_counts)
{
  si_index rcat;
  FLOAT *pending;               /* pre-closure outside weight per category */
  sihashf child_outside = make_sihashf(CHART_CELLS);
  sihashf child_inside = CHART_ENTRY(inside_chart, left_pos, right_pos);

  CHART_ENTRY(outside_chart, left_pos, right_pos) = child_outside;

  /* nothing derivable over this span, so its outside cell stays empty */
  if (sihashf_size(child_inside) == 0)
    return;

  pending = MALLOC((g->nnts+1)*sizeof(FLOAT));
  {
    size_t slot;
    for (slot = 1; slot <= g->nnts; slot++)
      pending[slot] = 0.0;
  }

  /* --- step 1: combine with sibling cells on the left --- */
  for (rcat = 1; rcat <= g->nnts; rcat++) {
    brule br = g->brules[rcat];
    FLOAT right_in = sihashf_ref(child_inside, rcat);

    /* need both rules for this right category and the category itself */
    if (!br || !(right_in > 0.0))
      continue;

    for ( ; br; br = br->next) {
      size_t sib_start;

      for (sib_start = 0; sib_start < left_pos; sib_start++) {
        FLOAT sib_in =
          sihashf_ref(CHART_ENTRY(inside_chart, sib_start, left_pos),
                      br->left);
        rulelist done;

        if (!(sib_in > 0.0))
          continue;

        if (br->active_parent) {
          FLOAT par_out =
            sihashf_ref(CHART_ENTRY(outside_chart, sib_start, right_pos),
                        br->active_parent);
          if (par_out > 0.0)
            pending[rcat] += par_out*sib_in;
        }

        for (done = br->completes; done; done = done->next) {
          FLOAT par_out =
            sihashf_ref(CHART_ENTRY(outside_chart, sib_start, right_pos),
                        g->rules[done->ruleid]->e[0]);
          FLOAT via_rule;

          if (!(par_out > 0.0))
            continue;

          via_rule = par_out*sib_in*g->weights[done->ruleid];
          pending[rcat] += via_rule;
          /* the binary rule's count is accumulated here and only here */
          rule_counts[done->ruleid] += via_rule*right_in/root_prob;
        }
      }
    }
  }

  /* --- step 2: combine with sibling cells on the right --- */
  for (rcat = 1; rcat <= g->nnts; rcat++) {
    brule br;
    size_t sib_end;

    for (br = g->brules[rcat]; br; br = br->next)
      for (sib_end = right_pos+1; sib_end <= nwords; sib_end++) {
        FLOAT sib_in =
          sihashf_ref(CHART_ENTRY(inside_chart, right_pos, sib_end), rcat);
        si_index lcat = br->left;
        FLOAT left_in = sihashf_ref(child_inside, lcat);
        rulelist done;

        if (!((sib_in > 0.0) && (left_in > 0.0)))
          continue;

        if (br->active_parent) {
          FLOAT par_out =
            sihashf_ref(CHART_ENTRY(outside_chart, left_pos, sib_end),
                        br->active_parent);
          if (par_out > 0.0) {
            if (lcat <= g->nnts)
              pending[lcat] += par_out*sib_in;
            else {
              /* categories above nnts skip unary closure and go straight
               * into the cell; anything not above ncats would be a
               * terminal */
              assert(lcat > g->ncats);
              sihashf_inc(child_outside, lcat, par_out*sib_in);
            }
          }
        }

        for (done = br->completes; done; done = done->next) {
          FLOAT par_out =
            sihashf_ref(CHART_ENTRY(outside_chart, left_pos, sib_end),
                        g->rules[done->ruleid]->e[0]);
          FLOAT via_rule;

          if (!(par_out > 0.0))
            continue;

          via_rule = par_out*sib_in*g->weights[done->ruleid];
          if (lcat <= g->nnts)
            pending[lcat] += via_rule;
          else {
            assert(lcat > g->ncats);   /* otherwise lcat is a terminal */
            sihashf_inc(child_outside, lcat, via_rule);
          }
          /* rule_counts is deliberately NOT incremented here: doing so
           * would double count the rule, which is already counted in
           * step 1 when the cell is visited as the rule's right child. */
        }
      }
  }

  /* --- step 3: unary closure (the root cell's closure is not done here) --- */
  {
    si_index pcat;

    for (pcat = 1; pcat <= g->nnts; pcat++) {
      FLOAT par_out = pending[pcat];
      catproblist down;

      if (!(par_out > 0.0))
        continue;

      for (down = g->parent_childprob[pcat]; down; down = down->next)
        if (sihashf_ref(child_inside, down->cat) > 0.0)
          sihashf_inc(child_outside, down->cat, down->prob*par_out);
    }
  }

  /* --- step 4: unary rule counts for this cell --- */
  increment_unary_counts(g, child_inside, child_outside, root_prob,
                         rule_counts);

  FREE(pending);
}
static chart inside_chart(vindex terms, grammar g, si_t si, FLOAT wordscale) { int left, right, mid; chart c = chart_make(terms->n); /* parent_completes is a sihashf of completed categories (i.e., not the * new, active categories produced by binarization). Unary closure * applies to these categories. * The pre-unary-closure parent weights are stored in parent_completes * before unary closure is applied to them */ /* Inside pass */ /* insert lexical items */ for (left=0; left< (int) terms->n; left++) { rulelist rl; si_index terminal = terms->e[left]; sihashf chart_entry = make_sihashf(NLABELS); CHART_ENTRY(c, left, left+1) = chart_entry; assert(terminal>0); if (terminal<=g->nnts) { fprintf(stderr, "Error in inside_chart() in expected-counts.c: " "input contains nonterminal symbol %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } if (terminal>g->ncats) { fprintf(stderr, "Error in inside_chart() in expected-counts.c:" " input contains unknown terminal %s\n", si_index_string(si, terminal)); exit(EXIT_FAILURE); } /* no need to actually enter terminal into chart */ /* sihashf_set(chart_entry, terminal, 1.0); */ rl = g->urules[terminal]; assert(rl); /* check there are rules for this terminal */ for ( ; rl; rl=rl->next) { si_index preterminal = g->rules[rl->ruleid]->e[0]; FLOAT preterminal_prob = g->weights[rl->ruleid]*wordscale; catproblist pp; /* assert(rl->ruleid<g->nrules); */ assert(g->child_parentprob[preterminal]); for (pp = g->child_parentprob[preterminal]; pp; pp = pp->next) sihashf_inc(chart_entry, pp->cat, preterminal_prob*pp->prob); } /* fprintf(stderr, "Chart entry %d-%d\n", (int) left, (int) left+1); chart_entry_display(stderr, chart_entry, si); */ } for (right=2; right<= (int) terms->n; right++) for (left=right-2; left>=0; left--) { sihashf parent_completes = make_sihashf(COMPLETE_CELLS); sihashf chart_entry = make_sihashf(CHART_CELLS); CHART_ENTRY(c, left, right) = chart_entry; for (mid=left+1; mid<right; mid++) binary_inside(g, chart_entry, 
parent_completes, CHART_ENTRY(c,left,mid), CHART_ENTRY(c,mid,right)); unary_closure_inside(g, chart_entry, parent_completes); free_sihashf(parent_completes); /* fprintf(stdout, "Chart entry %d-%d\n", (int) left, (int) right); */ /* chart_entry_display(stdout, CHART_ENTRY(inside,left,right), si); */ } return c; }
/* Read a weighted grammar from fp, binarize it, and normalize its weights.
 *
 * Each rule is read as:  weight  lhs  REWRITES  rhs-category ...
 * Reading stops at the first position where a weight cannot be parsed.
 * The lhs of the first rule becomes the root label.
 *
 *  - A rule with one rhs category is stored as a unary rule, indexed by
 *    its child.
 *  - A rule with two rhs categories is stored as a binary rule.
 *  - A longer rule is binarized from the right: for each suffix of the rhs
 *    a new category is created whose name is the suffix's category names
 *    joined by BINSEP, and a binary rule is added for it.
 *
 * After reading, every rule's prob is divided by the total weight
 * accumulated for its parent category.
 *
 * fp   stream to read from
 * si   symbol table used to map category names to indices and back
 *
 * Malformed input (non-positive weight, missing lhs, empty or over-long
 * rhs, over-long binarized label) is reported on stderr and terminates the
 * program with EXIT_FAILURE.  These checks are real runtime checks, not
 * assert()s, so they also protect builds compiled with NDEBUG; in
 * particular the fixed-size label buffer can no longer be overrun.
 *
 * Returns a grammar whose urs, brs and root_label fields are set.
 */
grammar read_grammar(FILE *fp, si_t si)
{
  sihashbrs left_brules_ht = make_sihashbrs(NLABELS);
  sihashurs child_urules_ht = make_sihashurs(NLABELS);
  sihashf parent_weight_ht = make_sihashf(NLABELS);
  brihashbr brihtbr = make_brihashbr(NLABELS);
  int n;
  double weight;
  urule ur;
  sihashbrsit bhit;
  sihashursit uhit;
  size_t root_label = 0, lhs, cat, rhs[MAXRHS];

  while ((n = fscanf(fp, " %lg ", &weight)) == 1) {  /* read the count */
    lhs = read_cat(fp, si);

    /* written as !(w > 0) so that a NaN weight is rejected as well */
    if (!(weight > 0)) {
      fprintf(stderr,
              "read_grammar() in grammar.c: rule weight must be positive\n");
      exit(EXIT_FAILURE);
    }
    if (!lhs) {
      fprintf(stderr,
              "read_grammar() in grammar.c: rule has no lhs category\n");
      exit(EXIT_FAILURE);
    }

    if (!root_label)
      root_label = lhs;

    /* Read the rewrites symbol.  The format has no conversions, so the
     * return value cannot tell a match from a mismatch; a missing symbol
     * is not detected here. */
    fscanf(fp, " " REWRITES);

    for (n=0; n<MAXRHS; n++) {  /* read the rhs, n is length of rhs */
      cat = read_cat(fp, si);
      if (!cat)
        break;
      rhs[n] = cat;
    }

    if (n >= MAXRHS) {
      fprintf(stderr, "read_grammar() in grammar.c: rule rhs too long\n");
      exit(EXIT_FAILURE);
    }

    switch (n) {
    case 0:
      fprintf(stderr, "read_grammar() in grammar.c: rule with empty rhs\n");
      exit(EXIT_FAILURE);
      break;
    case 1:
      ur = make_urule(weight, lhs, rhs[0]);
      push_urule(child_urules_ht, ur->child, ur);
      sihashf_inc(parent_weight_ht, ur->parent, weight);
      break;
    case 2:
      add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], rhs[1]);
      sihashf_inc(parent_weight_ht, lhs, weight);
      break;
    default:
      { int start, i, j;
        char bcat[MAXBLABELLEN], *s;
        si_index bparent, left, right;

        right = rhs[n-1];               /* rightmost category */
        for (start=n-2; start>=1; start--) {

          /* build the label for rhs[start..n-1] in bcat[] */
          i = 0;                        /* i is index into bcat[] */
          for (j=start; j<n; j++) {     /* j is index into rhs[] */
            if (j!=start) {
              /* keep room for this character and the final '\0' */
              if (i + 1 >= MAXBLABELLEN) {
                fprintf(stderr, "read_grammar() in grammar.c: "
                        "binarized category label too long\n");
                exit(EXIT_FAILURE);
              }
              bcat[i++] = BINSEP;
            }

            s = si_index_string(si, rhs[j]);
            while (*s) {
              if (i + 1 >= MAXBLABELLEN) {
                fprintf(stderr, "read_grammar() in grammar.c: "
                        "binarized category label too long\n");
                exit(EXIT_FAILURE);
              }
              bcat[i++] = *s++;
            }
          }

          bcat[i] = '\0';
          bparent = si_string_index(si, bcat);
          left = rhs[start];
          add_brule(left_brules_ht, brihtbr, weight, bparent, left, right);
          sihashf_inc(parent_weight_ht, bparent, weight);
          right = bparent;              /* becomes right child of next rule */
        }

        /* the topmost rule keeps the original lhs */
        add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], right);
        sihashf_inc(parent_weight_ht, lhs, weight);
      }}}

  free_brihashbr(brihtbr);      /* free brindex hash table */

  { int i;      /* normalize grammar rules */

    for (bhit = sihashbrsit_init(left_brules_ht); sihashbrsit_ok(bhit);
         bhit = sihashbrsit_next(bhit))
      for (i=0; i<bhit.value.n; i++)
        bhit.value.e[i]->prob /=
          sihashf_ref(parent_weight_ht, bhit.value.e[i]->parent);

    for (uhit = sihashursit_init(child_urules_ht); sihashursit_ok(uhit);
         uhit = sihashursit_next(uhit))
      for (i=0; i<uhit.value.n; i++)
        uhit.value.e[i]->prob /=
          sihashf_ref(parent_weight_ht, uhit.value.e[i]->parent);
  }

  free_sihashf(parent_weight_ht);

  {
    grammar g;
    g.urs = child_urules_ht;
    g.brs = left_brules_ht;
    g.root_label = root_label;
    return g;
  }
}