Beispiel #1
0
/* outside_chart() builds the outside chart for one sentence and adds that
 * sentence's contribution to rule_counts[].
 *
 *  g            - grammar (root label, parent_childprob table, rules, weights)
 *  si           - symbol table; not referenced in this function
 *  inside_chart - inside chart already computed for terms
 *  terms        - the terminal string
 *  yieldweight  - sentence weight; the root inside weight is divided by it
 *  rule_counts  - per-rule accumulators indexed by rule id
 *
 * Returns the outside chart made here with chart_make().
 */
static chart
outside_chart(const grammar g, const si_t si, const chart inside_chart,
	      const vindex terms, FLOAT yieldweight, FLOAT *rule_counts)
{
  int         lo, hi;
  size_t      nwords = terms->n;
  sihashf     top_inside = CHART_ENTRY(inside_chart, 0, nwords);
  FLOAT       root_prob = sihashf_ref(top_inside, g->root_label);
  chart       out = chart_make(nwords);
  sihashf     top_outside = make_sihashf(CHART_CELLS);
  catproblist entry;

  /* dividing by yieldweight acts as if the sentence had been seen
   * yieldweight times, since every count below is divided by root_prob
   */
  root_prob /= yieldweight;

  /* the root cell is filled in here rather than by binary_outside() */
  CHART_ENTRY(out, 0, nwords) = top_outside;

  for (entry = g->parent_childprob[g->root_label]; entry; entry = entry->next) {
    if (sihashf_ref(top_inside, entry->cat) > 0.0) {
      sihashf_set(top_outside, entry->cat, entry->prob);
    }
  }

  increment_unary_counts(g, top_inside, top_outside, root_prob,
                         rule_counts);

  /* every other cell, visited by decreasing right edge and then by
   * increasing left edge
   */
  for (hi = nwords; hi >= 1; hi--) {
    for (lo = 0; lo < hi; lo++) {
      if ((lo == 0) && (hi == nwords)) {
        continue;                       /* root cell already done */
      }
      binary_outside(g, lo, hi, nwords,
                     inside_chart, out, root_prob, rule_counts);
    }
  }

  /* finally, count the unary rules that rewrite to the terminals */
  for (lo = 0; lo < nwords; lo++) {
    sihashf  word_cell = CHART_ENTRY(out, lo, lo+1);
    rulelist r = g->urules[terms->e[lo]];

    while (r) {
      FLOAT weight = sihashf_ref(word_cell,
                                 g->rules[r->ruleid]->e[0]) *
                     g->weights[r->ruleid];
      rule_counts[r->ruleid] += weight/root_prob;
      sihashf_inc(word_cell, terms->e[lo], weight);
      r = r->next;
    }
  }

  return out;
}
Beispiel #2
0
/* cky() makes a chart for the terminal string terms, fills it using
 * grammar g, and returns it.
 *
 * Each terminal is first added to its one-word cell with probability 1.0
 * and follow_unary() is run on it.  The chart is then processed with the
 * left edge moving from the last word back to the first: for each right
 * edge short of the sentence end, apply_unary() is run on spans longer than
 * one word and then apply_binary(); finally apply_unary() is run on the
 * cell reaching the end of the sentence.
 *
 * si is not referenced in this function.
 */
chart
cky(struct vindex terms, grammar g, si_t si)
{
  const int nterms = (int) terms.n;
  int       start, end;
  chart     result = chart_make(terms.n);

  /* seed the one-word cells with the terminals themselves */
  for (start = 0; start < nterms; start++) {
    si_index    word = terms.e[start];
    sihashcc    span = CHART_ENTRY(result, start, start+1);
    sihashcc    from_start = result->vertex[start];
    chart_cell  cell = add_edge(span, word, NULL, NULL, 1.0,
                                start+1, from_start);

    assert(cell);  /* the edge must have been added */
    follow_unary(cell, span, g, start+1, from_start);
  }

  /* syntactic rules, left edge moving right-to-left */
  for (start = nterms-1; start >= 0; start--) {
    for (end = start+1; end < nterms; end++) {
      sihashcc span = CHART_ENTRY(result, start, end);

      /* unary rules are only applied here to spans wider than one word */
      if (end - start > 1) {
        apply_unary(span, g, end, result->vertex[start]);
      }
      apply_binary(span, start, end, result, g);
    }

    /* cells ending at the end of the sentence get unary rules only;
     * binary rules are not applied to them
     */
    apply_unary(CHART_ENTRY(result, start, terms.n), g,
                nterms, result->vertex[start]);
  }

  return result;
}
Beispiel #3
0
/* chart_display() writes every non-empty cell of chart c to fp.
 *
 * Cells are visited by increasing span width and, within a width, by
 * increasing left position.  For each cell whose sihashf_size() is positive
 * a "cell L-R" header line is printed, followed by the output of
 * chart_entry_display().
 *
 *  n  - number of words the chart spans
 *  si - symbol table handed on to chart_entry_display()
 */
void
chart_display(FILE *fp, chart c, size_t n, si_t si)
{
  size_t width;

  for (width = 1; width <= n; width++) {
    size_t start;

    for (start = 0; start + width <= n; start++) {
      size_t  end = start + width;
      sihashf cell = CHART_ENTRY(c, start, end);

      if (sihashf_size(cell) > 0) {
        fprintf(fp, "cell %d-%d\n", (int) start, (int) end);
        chart_entry_display(fp, cell, si);
      }
    }
  }
}
Beispiel #4
0
/* consistent_preterm_outsides() checks the one-word cells of an outside
 * chart against root_prob.
 *
 * For each position i, the value stored for terminal terms->e[i] in cell
 * (i, i+1) is compared with root_prob; the check fails as soon as the
 * relative difference exceeds EPSILON.
 *
 * Returns 1 if every position passes, 0 otherwise.
 */
int
consistent_preterm_outsides(chart outside, vindex terms,
			    FLOAT root_prob)
{
  size_t pos = 0;

  while (pos < terms->n) {
    if (fabs(sihashf_ref(CHART_ENTRY(outside,pos,pos+1),terms->e[pos])-root_prob)/
        root_prob
        > EPSILON) {
      return 0;
    }
    pos++;
  }

  return 1;
}
Beispiel #5
0
/* chart_make() allocates a chart for a sentence of n words.
 *
 * The chart gets n+1 vertex tables (one per string position) and one cell
 * table for every span (left, right) with 0 <= left < right <= n.
 * One-word cells are made with size NLABELS, all other tables with size
 * CHART_CELLS.
 *
 * Returns the new chart.
 */
chart
chart_make(size_t n)
{
  size_t  v, left, right, ncells = CHART_SIZE(n);
  chart   made = MALLOC(sizeof(struct chart));

  /* one table per string position */
  made->vertex = MALLOC((n+1)*sizeof(sihashcc));
  v = 0;
  while (v <= n) {
    made->vertex[v] = make_sihashcc(CHART_CELLS);
    v++;
  }

  /* one table per span */
  made->cell = MALLOC(ncells*sizeof(sihashcc));
  for (left = 0; left < n; left++) {
    for (right = left+1; right <= n; right++) {
      if (left+1 == right) {
        CHART_ENTRY(made, left, right) = make_sihashcc(NLABELS);
      } else {
        CHART_ENTRY(made, left, right) = make_sihashcc(CHART_CELLS);
      }
    }
  }

  return made;
}
Beispiel #6
0
/* apply_binary() combines the edges in one chart cell with edges starting
 * at position mid, using the binary rules of g.
 *
 *  left_entry - cell whose categories are tried as left children
 *  left, mid  - positions at which the left child starts and ends
 *  c          - the chart; c->vertex[mid] supplies the right children
 *  g          - grammar; g.brs is iterated with each key used as a left
 *               category and each value as an array of rules for that key
 *
 * For every (left child, rule, right child) match an edge for the rule's
 * parent is added with add_edge() to the cell spanning from left to the
 * right child's right position.  The edge's probability is the product of
 * both child probabilities and the rule probability.
 */
static void
apply_binary(sihashcc left_entry, int left, int mid, chart c, grammar g)
{
  sihashbrsit it;

  for (it = sihashbrsit_init(g.brs); sihashbrsit_ok(it);
       it = sihashbrsit_next(it)) {
    chart_cell lchild = sihashcc_ref(left_entry, it.key);
    size_t     k;

    if (!lchild) {
      continue;         /* this cell has no edge of that left category */
    }

    for (k = 0; k < it.value.n; k++) {
      /* walk all edges of the rule's right category that start at mid */
      chart_cell rchild = sihashcc_ref(c->vertex[mid], it.value.e[k]->right);

      while (rchild) {
        add_edge(CHART_ENTRY(c,left,rchild->rightpos), it.value.e[k]->parent,
                 &lchild->tree, &rchild->tree,
                 lchild->prob * rchild->prob *  it.value.e[k]->prob,
                 rchild->rightpos, c->vertex[left]);
        rchild = rchild->next;
      }
    }
  }
}
Beispiel #7
0
/* main() - command-line driver for the CKY parser.
 *
 * usage: prog yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]
 *
 *  yieldfile    - file of terminal strings, read with read_terms()
 *  maxsentlen   - sentences longer than this are reported as "too_long.";
 *                 0 (the default) disables the limit
 *  grammarfile  - grammar file; the grammar is read from stdin if omitted
 *  sentfrom,
 *  sentto       - inclusive range of sentence numbers (counted from 1) to
 *                 process; 0 (the default) leaves that end open
 *
 * For each sentence in range that is not too long, the tree on the root
 * edge is written to stdout, or "parse_failure." if there is no root edge.
 * A summary is written to stderr at the end.
 *
 * Exits with EXIT_FAILURE on a usage or argument error, EXIT_SUCCESS
 * otherwise.
 */
int
main(int argc, char **argv)
{
  si_t          si = make_si(1024);
  FILE          *grammarfp = stdin, *yieldfp;
  FILE          *tracefp = NULL;        /* trace output */
  FILE          *summaryfp = stderr;    /* end of parse stats output */
  FILE          *parsefp = stdout;      /* parse trees */
  FILE          *probfp = NULL;         /* max_neglog_prob */

  chart_cell    root_cell;
  grammar       g;
  chart         c;
  vindex        terms;
  int           maxsentlen = 0;
  int           sentenceno = 0, parsed_sentences = 0, failed_sentences = 0;
  double        sum_neglog_prob = 0;
  int           sentfrom = 0;
  int           sentto = 0;

  srand(RAND_SEED);     /* seed random number generator */

  if (argc<2 || argc>6) {
    fprintf(stderr, "%s yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  if ((yieldfp = fopen(argv[1], "r")) == NULL) {
    fprintf(stderr, "%s: Couldn't open yieldfile %s\n", argv[0], argv[1]);
    exit(EXIT_FAILURE);
  }

  /* sscanf() returns EOF, which is non-zero, when the argument is empty,
   * so success has to be tested as "exactly one conversion"
   */
  if (argc >= 3)
    if (sscanf(argv[2], "%d", &maxsentlen) != 1) {
      fprintf(stderr, "%s: Couldn't parse maxsentlen %s\n", argv[0], argv[2]);
      exit(EXIT_FAILURE);
    }

  if (argc >= 4)
    if ((grammarfp = fopen(argv[3], "r")) == NULL) {
      fprintf(stderr, "%s: Couldn't open grammarfile %s\n", argv[0], argv[3]);
      exit(EXIT_FAILURE);
    }

  /* NOTE: with exactly five arguments (sentfrom but no sentto) the fifth
   * argument is ignored
   */
  if (argc >= 6) {
    if (sscanf(argv[4], "%d", &sentfrom) != 1) {
      fprintf(stderr, "%s: Couldn't parse sentfrom %s\n", argv[0], argv[4]);
      exit(EXIT_FAILURE);
    }
    if (sscanf(argv[5], "%d", &sentto) != 1) {
      fprintf(stderr, "%s: Couldn't parse sentto %s\n", argv[0], argv[5]);
      exit(EXIT_FAILURE);
    }
  }

  g = read_grammar(grammarfp, si);
  /* write_grammar(tracefp, g, si); */

  while ((terms = read_terms(yieldfp, si))) {
    sentenceno++;

    /* sentences before the requested range are read and discarded */
    if (sentfrom && sentenceno < sentfrom) {
      vindex_free(terms);
      continue;
    }
    /* the first sentence past the range ends the run */
    if (sentto && sentenceno > sentto) {
      vindex_free(terms);
      break;
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      size_t    i;

      if (tracefp) {
        fprintf(tracefp, "\nSentence %d:\n", sentenceno);
        for (i=0; i<terms->n; i++)
          fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
        fprintf(tracefp, "\n");
      }

      c = cky(*terms, g, si);

      /* fetch best root node */

      root_cell = sihashcc_ref(CHART_ENTRY(c, 0, terms->n), g.root_label);

      if (root_cell) {
        tree parse_tree = bintree_tree(&root_cell->tree, si);
        double prob = (double) root_cell->prob;

        parsed_sentences++;
        assert(prob > 0.0);
        sum_neglog_prob -= log(prob);

        if (probfp)
          fprintf(probfp, "max_neglog_prob(%d, %g).\n",
                  sentenceno, -log(prob));

        if (tracefp)
          fprintf(tracefp, " Prob = %g\n", prob);

        if (parsefp) {
          write_tree(parsefp, parse_tree, si);
          fprintf(parsefp, "\n");
          /* write_prolog_tree(parsefp, parse_tree, si); */
        }

        free_tree(parse_tree);
      }

      else {
        failed_sentences++;
        if (tracefp)
          fprintf(tracefp, "Failed to parse\n");
        if (parsefp)
          fprintf(parsefp, "parse_failure.\n");
      }

      chart_free(c, terms->n);                  /* free the chart */
    }
    else {                                      /* sentence too long */
      if (parsefp)
        fprintf(parsefp, "too_long.\n");
    }

    vindex_free(terms);                         /*  free the terms */
    assert(trees_allocated == 0);
    assert(bintrees_allocated == 0);
  }

  /* release the input files; stdin is left open */
  fclose(yieldfp);
  if (grammarfp != stdin)
    fclose(grammarfp);

  free_grammar(g);
  si_free(si);

  if (summaryfp) {
    int    attempted = parsed_sentences + failed_sentences;
    /* report 0% instead of dividing by zero when nothing was read
     * or nothing met the length criterion
     */
    double pct_attempted = sentenceno > 0
                           ? (100.0 * attempted) / sentenceno : 0.0;
    double pct_parsed = attempted > 0
                        ? (100.0 * parsed_sentences) / attempted : 0.0;

    fprintf(summaryfp, "\n%d/%d = %g%% test sentences met the length criteron,"
            " of which %d/%d = %g%% were parsed\n",
            attempted, sentenceno, pct_attempted,
            parsed_sentences, attempted, pct_parsed);
    fprintf(summaryfp, "Sum(-log prob) = %g\n", sum_neglog_prob);
  }

  /* check that everything has been deallocated */
  /* printf("mmm_blocks_allocated = %ld\n", (long) mmm_blocks_allocated); */
  assert(mmm_blocks_allocated == 0);
  exit(EXIT_SUCCESS);
}
Beispiel #8
0
/* expected_rule_counts() makes one pass over the yield file and accumulates
 * rule counts from the inside and outside charts of each sentence.
 *
 *  g                    - grammar; its unary closure is computed on entry
 *                         and freed before returning
 *  si                   - symbol table
 *  yieldfp              - yield file; rewound before reading
 *  tracefp              - per-sentence trace output, used only when
 *                         non-NULL and debuglevel >= 10000
 *  summaryfp            - receives the failed-sentence count when non-NULL
 *                         and debuglevel >= 1000
 *  maxsentlen           - longer sentences are skipped; 0 means no limit
 *  minruleprob          - passed to compute_unary_closure()
 *  wordscale            - passed to inside_chart(); its log, times the
 *                         sentence length, is removed from the log weight
 *  rule_counts          - output array of g->nrules entries, zeroed here
 *  sum_yieldweights     - output: sum of yieldweight * sentence length over
 *                         the sentences that parsed
 *  weighted_yields_flag - passed to read_terms()
 *
 * Returns the sum over parsed sentences of
 *   -yieldweight * (log(root_prob) - n*log(wordscale)).
 */
FLOAT 
expected_rule_counts(const grammar g, const si_t si, FILE *yieldfp, 
		     FILE *tracefp, FILE *summaryfp, int debuglevel,
		     int maxsentlen, FLOAT minruleprob, FLOAT wordscale,
		     FLOAT *rule_counts, FLOAT *sum_yieldweights,
		     int weighted_yields_flag)
{
  vindex  terms;
  FLOAT   root_prob;
  chart   inside, outside;
  long    sentenceno = 0, failed_sentences = 0;
  double  sum_neglog_prob = 0.0;
  float   yieldweight;

  *sum_yieldweights = 0;

  { size_t i;                    /* zero rule counts */
    for (i=0; i<g->nrules; i++)
      rule_counts[i] = 0.0;
  }

  compute_unary_closure(g, minruleprob); /* compute unary_close */

  rewind(yieldfp);               /* start again from the first sentence */

  while ((terms = read_terms(weighted_yields_flag, yieldfp, si, &yieldweight))) {
    sentenceno++;

    /* This block writes to tracefp, so tracefp is what must be tested. */
    if (tracefp && debuglevel >= 10000) {
      size_t    i;
      fprintf(tracefp, "\nSentence %ld:\n", sentenceno);

      for (i=0; i<terms->n; i++)
        fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
      fprintf(tracefp, "\n");
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      inside = inside_chart(terms, g, si, wordscale);
      /* chart_display(stdout, inside, terms->n, si);  */
      root_prob = sihashf_ref(CHART_ENTRY(inside, 0, terms->n),
                              g->root_label);

      if (root_prob > 0.0) {
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Sum of derivation weights = %g\n",
                  root_prob);
        sum_neglog_prob -= yieldweight*(log(root_prob)-terms->n*log(wordscale));
        *sum_yieldweights += yieldweight*terms->n;
        /* the outside pass is what updates rule_counts; the chart itself
         * is not needed afterwards
         */
        outside = outside_chart(g, si, inside, terms, yieldweight, rule_counts);
        /* assert(consistent_preterm_outsides(outside, terms, root_prob)); */
        /* chart_display(stdout, outside, terms->n, si); */
        chart_free(outside, terms->n);
      }
      else {
        failed_sentences++;
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Failed to parse.\n");
      }
      chart_free(inside, terms->n);             /* free the chart */
    }
    else {                                      /* sentence too long */
      if (tracefp && debuglevel >= 10000)
        fprintf(tracefp, "Too long to parse.\n");
    }
    vindex_free(terms);                         /*  and its terms */
  }

  /* free unary closure */
  free_unary_closure(g);

  if (summaryfp && debuglevel >= 1000) {
    if (failed_sentences>0)
      fprintf(summaryfp, " %ld sentences failed to parse",
              (long) failed_sentences);
  }
  return(sum_neglog_prob);
}
Beispiel #9
0
/* binary_outside() computes the outside cell for the span
 * (left_pos, right_pos) and adds the binary-rule counts that involve this
 * span as a right child to rule_counts[].
 *
 *  g             - grammar (brules, rules, weights, parent_childprob, nnts,
 *                  ncats)
 *  left_pos,
 *  right_pos     - the span whose outside cell is computed
 *  nwords        - sentence length, i.e. the largest right position
 *  inside_chart  - completed inside chart
 *  outside_chart - outside chart being filled; the cells of the enclosing
 *                  spans (far_left_pos, right_pos) and
 *                  (left_pos, far_right_pos) are read from it
 *  root_prob     - divisor applied to every count increment
 *  rule_counts   - per-rule accumulators indexed by rule id
 *
 * A new cell is always made and installed in outside_chart, even when the
 * function returns early because the inside cell is empty.
 */
static void
binary_outside(const grammar g, size_t left_pos, size_t right_pos, 
	       size_t nwords, const chart inside_chart, chart outside_chart, 
	       FLOAT root_prob, FLOAT *rule_counts)
{
  si_index  right_cat;
  /* completes[cat], for cat in 1..g->nnts, accumulates the outside weight
   * of cat in this cell before unary closure is applied
   */
  FLOAT	    *completes;
  sihashf   child_outside = make_sihashf(CHART_CELLS);
  sihashf   child_inside = CHART_ENTRY(inside_chart, left_pos, right_pos);

  CHART_ENTRY(outside_chart, left_pos, right_pos) = child_outside;
  
  /* if the inside chart cell is empty, then there's no point calculating
   * the outside cell
   */

  if (sihashf_size(child_inside) == 0)
    return;

  /* allocated after the early return above, so that path has nothing
   * to free; index 0 is allocated but not initialised
   */
  completes = MALLOC((g->nnts+1)*sizeof(FLOAT));
  { size_t child_cat;
    for (child_cat=1; child_cat<=g->nnts; child_cat++)
      completes[child_cat] = 0.0;   
  } 

  /* try to combine with cells on left */
  /* Here this cell is the right child: g->brules[] is indexed by the right
   * category, the left sibling spans (far_left_pos, left_pos) and the
   * parent spans (far_left_pos, right_pos).
   */

  for (right_cat=1; right_cat<=g->nnts; right_cat++) {
    brule bp = g->brules[right_cat];
    FLOAT right_inside_weight = sihashf_ref(child_inside, right_cat);

    /* rules and inside category? */
    if (bp&&(right_inside_weight>0.0)) {
      for ( ; bp; bp=bp->next) {
	size_t far_left_pos;
	for (far_left_pos=0; far_left_pos<left_pos; far_left_pos++) {
	  FLOAT far_left_inside_weight =
	    sihashf_ref(CHART_ENTRY(inside_chart, far_left_pos, left_pos), 
			bp->left);
	  if (far_left_inside_weight>0.0) {
	    rulelist rp;
	    /* parent given by bp->active_parent: no rule weight is
	     * applied and no rule count is incremented
	     */
	    if (bp->active_parent) {
	      FLOAT parent_outside_weight =
		sihashf_ref(CHART_ENTRY(outside_chart,far_left_pos,right_pos), 
			    bp->active_parent);
	      if (parent_outside_weight>0.0)
		completes[right_cat] += parent_outside_weight*
		                        far_left_inside_weight;
	    }
	    /* rules listed in bp->completes: the parent is the rule's
	     * element 0 and the rule weight is applied
	     */
	    for (rp=bp->completes; rp; rp=rp->next) {
	      FLOAT parent_outside_weight = 
		sihashf_ref(CHART_ENTRY(outside_chart,far_left_pos,right_pos), 
			    g->rules[rp->ruleid]->e[0]);
	      if (parent_outside_weight>0.0) {
		FLOAT parent_left_rule_weight = parent_outside_weight*
		                                far_left_inside_weight*
                                                g->weights[rp->ruleid];
		completes[right_cat] += parent_left_rule_weight ;
		/* the only place in this function where a binary rule's
		 * count is incremented
		 */
		rule_counts[rp->ruleid] += parent_left_rule_weight*
		                           right_inside_weight/root_prob;
	      }}}}}}}

  /* try to combine with cells on right */
  /* Here this cell is the left child (category bp->left): the right
   * sibling spans (right_pos, far_right_pos) and the parent spans
   * (left_pos, far_right_pos).
   */

  for (right_cat=1; right_cat<=g->nnts; right_cat++) {
    brule bp;
    size_t far_right_pos;
    for (bp=g->brules[right_cat]; bp; bp=bp->next)
      for (far_right_pos=right_pos+1; far_right_pos<=nwords; far_right_pos++) {
	FLOAT far_right_inside_weight = 
	      sihashf_ref(CHART_ENTRY(inside_chart, right_pos, far_right_pos),
			  right_cat);
	si_index child_cat=bp->left;
	FLOAT    child_inside_weight = sihashf_ref(child_inside, child_cat);
	if ((far_right_inside_weight>0.0)&&(child_inside_weight>0.0)) {
	  rulelist rp;
	  if (bp->active_parent) {
	    FLOAT parent_outside_weight =
	      sihashf_ref(CHART_ENTRY(outside_chart, left_pos, far_right_pos),
			  bp->active_parent);
	    if (parent_outside_weight>0.0) {
	      /* categories up to g->nnts go through completes[] and so
	       * through unary closure below; any other category is added
	       * to the outside cell directly
	       */
	      if (child_cat<=g->nnts)
		completes[child_cat] += parent_outside_weight*
		                        far_right_inside_weight;
	      else {
		assert(child_cat>g->ncats); /* otherwise child_cat is a term */
		sihashf_inc(child_outside, child_cat, 
			    parent_outside_weight*far_right_inside_weight);
	      }
	    }}
	  for (rp=bp->completes; rp; rp=rp->next) {
	    FLOAT parent_outside_weight =
	      sihashf_ref(CHART_ENTRY(outside_chart, left_pos, far_right_pos),
			  g->rules[rp->ruleid]->e[0]);
	    if (parent_outside_weight>0.0) {
	      FLOAT parent_right_rule_weight = parent_outside_weight*
		                               far_right_inside_weight*
		                               g->weights[rp->ruleid];
	      if (child_cat<=g->nnts)
		completes[child_cat] += parent_right_rule_weight;
	      else {
		assert(child_cat>g->ncats); /* otherwise child_cat is a term */
		sihashf_inc(child_outside,child_cat,parent_right_rule_weight);
	      }
	      /* don't double count the rule! 
               * it's been counted before from the left 
	       */
	      /* rule_counts[rp->ruleid] += parent_right_rule_weight*
                                            child_inside_weight/root_prob; 
	       */
	    }}}}}

  /* unary closure */  
  /* unary closure for root cell done in outside_chart() */
  /* Each accumulated weight in completes[] is spread over the categories
   * listed in g->parent_childprob[] for that parent, restricted to
   * categories with a positive inside weight in this cell.
   */

  { si_index parent_cat;
  
    for (parent_cat=1; parent_cat<=g->nnts; parent_cat++) {
      FLOAT parent_outside_weight = completes[parent_cat];
      if (parent_outside_weight>0.0) {
	catproblist cp;
	for (cp = g->parent_childprob[parent_cat]; cp; cp = cp->next) 
	  if (sihashf_ref(child_inside, cp->cat) > 0.0)
	    sihashf_inc(child_outside, cp->cat, cp->prob*parent_outside_weight);
      }}}

  /* increment unary rule_counts */ 
  /* rule counts for root cell done in outside_chart() */
  increment_unary_counts(g, child_inside, child_outside, root_prob,
			 rule_counts);

  FREE(completes);
}
Beispiel #10
0
/* inside_chart() builds and returns the inside chart for the terminal
 * string terms.
 *
 *  terms     - the terminal string
 *  g         - grammar (urules, rules, weights, child_parentprob, nnts,
 *              ncats)
 *  si        - symbol table, used only for error messages
 *  wordscale - factor multiplied into the weight of every rule that
 *              rewrites to a terminal
 *
 * The program exits with EXIT_FAILURE if the input contains a symbol that
 * is a nonterminal (index <= g->nnts) or an unknown terminal
 * (index > g->ncats).
 */
static chart
inside_chart(vindex terms, grammar g, si_t si, FLOAT wordscale)
{
  const int nwords = (int) terms->n;
  int       start, end, split;
  chart     c = chart_make(terms->n);

  /* Lexical pass: one cell per word.  The terminal itself is not entered
   * into the cell, only the categories reachable from its preterminals.
   */
  for (start = 0; start < nwords; start++) {
    si_index    terminal = terms->e[start];
    sihashf     word_cell = make_sihashf(NLABELS);
    rulelist    rl;

    CHART_ENTRY(c, start, start+1) = word_cell;

    assert(terminal>0);
    if (terminal<=g->nnts) {
      fprintf(stderr,
              "Error in inside_chart() in expected-counts.c: "
              "input contains nonterminal symbol %s\n",
              si_index_string(si, terminal));
      exit(EXIT_FAILURE);
    }
    if (terminal>g->ncats) {
      fprintf(stderr,
              "Error in inside_chart() in expected-counts.c:"
              " input contains unknown terminal %s\n",
              si_index_string(si, terminal));
      exit(EXIT_FAILURE);
    }

    rl = g->urules[terminal];
    assert(rl);   /* every terminal must have at least one rule */
    while (rl) {
      si_index    preterminal = g->rules[rl->ruleid]->e[0];
      FLOAT       scaled_weight = g->weights[rl->ruleid]*wordscale;
      catproblist up;

      assert(g->child_parentprob[preterminal]);
      /* add each category listed for this preterminal in
       * g->child_parentprob[]
       */
      for (up = g->child_parentprob[preterminal]; up; up = up->next) {
        sihashf_inc(word_cell, up->cat, scaled_weight*up->prob);
      }
      rl = rl->next;
    }
  }

  /* Wider spans, by increasing right edge and then decreasing left edge.
   * completed is handed to binary_inside() and then to
   * unary_closure_inside() along with the span's cell, and is freed once
   * the span is finished.
   */
  for (end = 2; end <= nwords; end++) {
    for (start = end-2; start >= 0; start--) {
      sihashf completed = make_sihashf(COMPLETE_CELLS);
      sihashf span_cell = make_sihashf(CHART_CELLS);

      CHART_ENTRY(c, start, end) = span_cell;

      for (split = start+1; split < end; split++) {
        binary_inside(g, span_cell, completed,
                      CHART_ENTRY(c,start,split), CHART_ENTRY(c,split,end));
      }

      unary_closure_inside(g, span_cell, completed);
      free_sihashf(completed);
    }
  }

  return c;
}