/*
 * write_local_trees() writes every entry of localtree_ht to stdout, one per
 * line, in the form
 *
 *     <count> TAB <parent> REWRITES <child> <child> ...
 *
 * where the symbols are looked up in si.  Entries whose parent (e[0]) equals
 * root are written first, followed by all remaining entries; within each
 * group the order is the hash table's iteration order.
 */
void write_local_trees(const vihashl localtree_ht, si_t si, si_index root)
{
  vihashlit hit;
  int       pass;

  /* pass 0 writes the entries headed by root, pass 1 writes all the others */
  for (pass = 0; pass < 2; pass++) {
    for (hit = vihashlit_init(localtree_ht); vihashlit_ok(hit); hit = vihashlit_next(hit)) {
      vindex  vi = (vindex) hit.key;
      long    count;
      size_t  i;
      char    *string;

      assert(vi->n > 0);
      assert(vi->n <= MAXRHS);

      /* skip entries that belong to the other pass */
      if ((vi->e[0] == root) != (pass == 0))
        continue;

      count = hit.value;
      string = si_index_string(si, vi->e[0]);
      assert(string);
      printf("%ld\t%s " REWRITES, count, string);

      for (i = 1; i < vi->n; i++) {
        string = si_index_string(si, vi->e[i]);
        assert(string);
        printf(" %s", string);
      }

      printf("\n");
    }
  }
}
Beispiel #2
0
/*
 * chart_entry_display() writes each (category, value) pair stored in
 * chart_entry to fp as " <category>: <value>", one pair per line, in the
 * hash table's iteration order.
 */
static void
chart_entry_display(FILE *fp, sihashf chart_entry, si_t si)
{
  sihashfit hit;

  for (hit = sihashfit_init(chart_entry); sihashfit_ok(hit);
       hit = sihashfit_next(hit)) {
    /* %g takes a double; converting through float first would flush values
     * outside float range to 0 or inf before they are printed */
    fprintf(fp, " %s: %g\n", si_index_string(si, hit.key), (double) hit.value);
  }
}
Beispiel #3
0
/*
 * write_grammar() writes the rules of g to fp, one rule per line:
 *
 *     <prob> TAB <parent> REWRITES <left> <right>     (binary rules, g.brs)
 *     <prob> TAB <parent> REWRITES <child>            (unary rules, g.urs)
 *
 * All binary rules are written before any unary rule.  Symbols are looked
 * up in si.
 */
void
write_grammar(FILE *fp, grammar g, si_t si)
{
  sihashbrsit bhit;
  sihashursit uhit;
  size_t      i;

  /* binary rules */
  for (bhit = sihashbrsit_init(g.brs); sihashbrsit_ok(bhit); bhit = sihashbrsit_next(bhit)) {
    for (i = 0; i < bhit.value.n; i++) {
      fprintf(fp, "%g\t%s " REWRITES " %s %s\n", (double) bhit.value.e[i]->prob,
              si_index_string(si, bhit.value.e[i]->parent),
              si_index_string(si, bhit.value.e[i]->left),
              si_index_string(si, bhit.value.e[i]->right));
    }
  }

  /* unary rules */
  for (uhit = sihashursit_init(g.urs); sihashursit_ok(uhit); uhit = sihashursit_next(uhit)) {
    for (i = 0; i < uhit.value.n; i++) {
      fprintf(fp, "%g\t%s " REWRITES " %s\n", (double) uhit.value.e[i]->prob,
              si_index_string(si, uhit.value.e[i]->parent),
              si_index_string(si, uhit.value.e[i]->child));
    }
  }
}
Beispiel #4
0
/*
 * main() - command-line driver for the CKY parser.
 *
 * Usage: prog yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]
 *
 *   yieldfile    file of sentences, read with read_terms()
 *   maxsentlen   sentences longer than this are reported as "too_long.";
 *                0 (the default) disables the length limit
 *   grammarfile  grammar read with read_grammar(); defaults to stdin
 *   sentfrom     first sentence number to process; 0 (the default) = no bound
 *   sentto       last sentence number to process; 0 (the default) = no bound
 *
 * For each processed sentence the best parse (or "parse_failure.") is
 * written to stdout; a summary is written to stderr at the end.
 * Exits with EXIT_FAILURE on a usage or argument error.
 */
int
main(int argc, char **argv)
{
  si_t          si = make_si(1024);
  FILE          *grammarfp = stdin, *yieldfp;
  FILE          *tracefp = NULL;        /* trace output */
  FILE          *summaryfp = stderr;    /* end of parse stats output */
  FILE          *parsefp = stdout;      /* parse trees */
  FILE          *probfp = NULL;         /* max_neglog_prob */

  chart_cell    root_cell;
  grammar       g;
  chart         c;
  vindex        terms;
  int           maxsentlen = 0;
  int           sentenceno = 0, parsed_sentences = 0, failed_sentences = 0;
  double        sum_neglog_prob = 0;
  int           sentfrom = 0;
  int           sentto = 0;

  srand(RAND_SEED);     /* seed random number generator */

  if (argc < 2 || argc > 6) {
    fprintf(stderr, "%s yieldfile [maxsentlen [grammarfile [sentfrom sentto]]]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  if ((yieldfp = fopen(argv[1], "r")) == NULL) {
    fprintf(stderr, "%s: Couldn't open yieldfile %s\n", argv[0], argv[1]);
    exit(EXIT_FAILURE);
  }

  /* sscanf() returns EOF (non-zero) on empty input, so success must be
   * tested as "exactly one conversion", not as "non-zero" */
  if (argc >= 3) {
    if (sscanf(argv[2], "%d", &maxsentlen) != 1) {
      fprintf(stderr, "%s: Couldn't parse maxsentlen %s\n", argv[0], argv[2]);
      exit(EXIT_FAILURE);
    }
  }

  if (argc >= 4) {
    if ((grammarfp = fopen(argv[3], "r")) == NULL) {
      fprintf(stderr, "%s: Couldn't open grammarfile %s\n", argv[0], argv[3]);
      exit(EXIT_FAILURE);
    }
  }

  /* the sentence range is only honoured when both bounds are given */
  if (argc >= 6) {
    if (sscanf(argv[4], "%d", &sentfrom) != 1) {
      fprintf(stderr, "%s: Couldn't parse sentfrom %s\n", argv[0], argv[4]);
      exit(EXIT_FAILURE);
    }
    if (sscanf(argv[5], "%d", &sentto) != 1) {
      fprintf(stderr, "%s: Couldn't parse sentto %s\n", argv[0], argv[5]);
      exit(EXIT_FAILURE);
    }
  }

  g = read_grammar(grammarfp, si);
  /* write_grammar(tracefp, g, si); */

  while ((terms = read_terms(yieldfp, si))) {
    sentenceno++;

    /* sentences before the requested range are skipped ... */
    if (sentfrom && sentenceno < sentfrom) {
      vindex_free(terms);
      continue;
    }
    /* ... and reading stops at the first sentence past it */
    if (sentto && sentenceno > sentto) {
      vindex_free(terms);
      break;
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      size_t    i;

      if (tracefp) {
        fprintf(tracefp, "\nSentence %d:\n", sentenceno);
        for (i = 0; i < terms->n; i++)
          fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
        fprintf(tracefp, "\n");
      }

      c = cky(*terms, g, si);

      /* fetch best root node */
      root_cell = sihashcc_ref(CHART_ENTRY(c, 0, terms->n), g.root_label);

      if (root_cell) {
        tree parse_tree = bintree_tree(&root_cell->tree, si);
        double prob = (double) root_cell->prob;

        parsed_sentences++;
        assert(prob > 0.0);
        sum_neglog_prob -= log(prob);

        if (probfp)
          fprintf(probfp, "max_neglog_prob(%d, %g).\n",
                  sentenceno, -log(prob));

        if (tracefp)
          fprintf(tracefp, " Prob = %g\n", prob);

        if (parsefp) {
          write_tree(parsefp, parse_tree, si);
          fprintf(parsefp, "\n");
          /* write_prolog_tree(parsefp, parse_tree, si); */
        }

        free_tree(parse_tree);
      }
      else {
        failed_sentences++;
        if (tracefp)
          fprintf(tracefp, "Failed to parse\n");
        if (parsefp)
          fprintf(parsefp, "parse_failure.\n");
      }

      chart_free(c, terms->n);                  /* free the chart */
    }
    else {                                      /* sentence too long */
      if (parsefp)
        fprintf(parsefp, "too_long.\n");
    }

    vindex_free(terms);                         /* free the terms */
    assert(trees_allocated == 0);
    assert(bintrees_allocated == 0);
  }

  free_grammar(g);
  si_free(si);

  /* release the input streams; stdin is left open */
  fclose(yieldfp);
  if (grammarfp != stdin)
    fclose(grammarfp);

  if (summaryfp) {
    int    attempted = parsed_sentences + failed_sentences;
    /* report 0% instead of dividing by zero when nothing was read/attempted */
    double pct_attempted = sentenceno ? (100.0 * attempted) / sentenceno : 0.0;
    double pct_parsed = attempted ? (100.0 * parsed_sentences) / attempted : 0.0;

    fprintf(summaryfp, "\n%d/%d = %g%% test sentences met the length criteron,"
            " of which %d/%d = %g%% were parsed\n",
            attempted, sentenceno, pct_attempted,
            parsed_sentences, attempted, pct_parsed);
    fprintf(summaryfp, "Sum(-log prob) = %g\n", sum_neglog_prob);
  }

  /* check that everything has been deallocated */
  /* printf("mmm_blocks_allocated = %ld\n", (long) mmm_blocks_allocated); */
  assert(mmm_blocks_allocated == 0);
  exit(EXIT_SUCCESS);
}
Beispiel #5
0
/*
 * expected_rule_counts() makes one pass over the yields in yieldfp.
 *
 * rule_counts[0 .. g->nrules-1] is zeroed on entry and then passed to
 * outside_chart() for every sentence that parses.  *sum_yieldweights is set
 * to the sum of yieldweight * sentence length over the parsed sentences.
 *
 * Sentences longer than maxsentlen are skipped (maxsentlen == 0 disables
 * the limit).  Per-sentence tracing goes to tracefp when it is non-NULL and
 * debuglevel >= 10000; the failure count goes to summaryfp when it is
 * non-NULL and debuglevel >= 1000.
 *
 * Returns the sum over parsed sentences of
 *     -yieldweight * (log(root_prob) - n * log(wordscale)),
 * where n is the sentence length.
 */
FLOAT
expected_rule_counts(const grammar g, const si_t si, FILE *yieldfp,
                     FILE *tracefp, FILE *summaryfp, int debuglevel,
                     int maxsentlen, FLOAT minruleprob, FLOAT wordscale,
                     FLOAT *rule_counts, FLOAT *sum_yieldweights,
                     int weighted_yields_flag)
{
  vindex  terms;
  FLOAT   root_prob;
  chart   inside, outside;
  long    sentenceno = 0, parsed_sentences = 0, failed_sentences = 0;
  double  sum_neglog_prob = 0.0;
  float   yieldweight;

  *sum_yieldweights = 0;
  /*  FLOAT *rule_counts = CALLOC(g->nrules, sizeof(FLOAT)); */

  { size_t i;                    /* zero rule counts */
    for (i = 0; i < g->nrules; i++)
      rule_counts[i] = 0.0;
  }

  compute_unary_closure(g, minruleprob); /* compute unary_close */

  rewind(yieldfp);               /* rewind the tree file */

  while ((terms = read_terms(weighted_yields_flag, yieldfp, si, &yieldweight))) {
    sentenceno++;

    /* the guard must test tracefp, the stream actually written to */
    if (tracefp && debuglevel >= 10000) {
      size_t    i;
      fprintf(tracefp, "\nSentence %ld:\n", sentenceno);

      for (i = 0; i < terms->n; i++)
        fprintf(tracefp, " %s", si_index_string(si, terms->e[i]));
      fprintf(tracefp, "\n");
    }

    /* skip if sentence is too long */
    if (!maxsentlen || (int) terms->n <= maxsentlen) {
      inside = inside_chart(terms, g, si, wordscale);
      /* chart_display(stdout, inside, terms->n, si);  */
      root_prob = sihashf_ref(CHART_ENTRY(inside, 0, terms->n),
                              g->root_label);

      if (root_prob > 0.0) {
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Sum of derivation weights = %g\n",
                  root_prob);
        /* subtract n*log(wordscale) to undo the per-word scaling */
        sum_neglog_prob -= yieldweight*(log(root_prob)-terms->n*log(wordscale));
        *sum_yieldweights += yieldweight*terms->n;
        parsed_sentences++;
        outside = outside_chart(g, si, inside, terms, yieldweight, rule_counts);
        /* assert(consistent_preterm_outsides(outside, terms, root_prob)); */
        /* chart_display(stdout, outside, terms->n, si); */
        chart_free(outside, terms->n);
      }
      else {
        failed_sentences++;
        if (tracefp && debuglevel >= 10000)
          fprintf(tracefp, "Failed to parse.\n");
      }
      chart_free(inside, terms->n);             /* free the chart */
    }
    else {                                      /* sentence too long */
      if (tracefp && debuglevel >= 10000)
        fprintf(tracefp, "Too long to parse.\n");
    }
    vindex_free(terms);                         /* and its terms */
  }

  /* free unary closure */
  free_unary_closure(g);

  if (summaryfp && debuglevel >= 1000) {
    if (failed_sentences > 0)
      fprintf(summaryfp, " %ld sentences failed to parse",
              (long) failed_sentences);
  }
  return(sum_neglog_prob);
}
Beispiel #6
0
/*
 * inside_chart() builds and returns the inside chart for the sentence terms.
 *
 * Width-1 cells are filled from the unary rules indexed by each terminal:
 * each rule's weight is multiplied by wordscale and distributed over the
 * categories listed in g->child_parentprob for the rule's parent.  Wider
 * cells are filled by binary_inside() over every split point, followed by
 * unary_closure_inside().
 *
 * Exits with EXIT_FAILURE if the input contains a nonterminal symbol or a
 * terminal unknown to the grammar.
 */
static chart
inside_chart(vindex terms, grammar g, si_t si, FLOAT wordscale)
{
  const int nterms = (int) terms->n;
  chart     c = chart_make(terms->n);
  int       begin, end, split;

  /* Lexical pass: one cell per terminal. */

  for (begin = 0; begin < nterms; begin++) {
    si_index  word = terms->e[begin];
    sihashf   cell = make_sihashf(NLABELS);
    rulelist  rule;

    CHART_ENTRY(c, begin, begin+1) = cell;

    assert(word > 0);

    if (word <= g->nnts) {
      fprintf(stderr,
              "Error in inside_chart() in expected-counts.c: "
              "input contains nonterminal symbol %s\n",
              si_index_string(si, word));
      exit(EXIT_FAILURE);
    }

    if (word > g->ncats) {
      fprintf(stderr,
              "Error in inside_chart() in expected-counts.c:"
              " input contains unknown terminal %s\n",
              si_index_string(si, word));
      exit(EXIT_FAILURE);
    }

    /* the terminal itself is not entered into the chart, only the
     * categories that rewrite to it */
    rule = g->urules[word];
    assert(rule);               /* there must be rules for this terminal */

    while (rule) {
      si_index    preterm = g->rules[rule->ruleid]->e[0];
      FLOAT       scaled = g->weights[rule->ruleid]*wordscale;
      catproblist ancestor = g->child_parentprob[preterm];

      assert(ancestor);
      for ( ; ancestor; ancestor = ancestor->next)
        sihashf_inc(cell, ancestor->cat, scaled*ancestor->prob);

      rule = rule->next;
    }
  }

  /* Inside pass over spans of width >= 2.
   *
   * completes is a sihashf of completed categories (i.e., not the new,
   * active categories produced by binarization).  It holds their weights
   * before unary closure is applied to them, and is discarded once the
   * cell has been closed.
   */

  for (end = 2; end <= nterms; end++) {
    for (begin = end-2; begin >= 0; begin--) {
      sihashf completes = make_sihashf(COMPLETE_CELLS);
      sihashf cell = make_sihashf(CHART_CELLS);

      CHART_ENTRY(c, begin, end) = cell;

      split = begin+1;
      while (split < end) {
        binary_inside(g, cell, completes,
                      CHART_ENTRY(c, begin, split), CHART_ENTRY(c, split, end));
        split++;
      }

      unary_closure_inside(g, cell, completes);
      free_sihashf(completes);
    }
  }

  return c;
}
Beispiel #7
0
/*
 * read_grammar() reads weighted rules from fp until no further weight can
 * be read, and returns them as a grammar.
 *
 * Each rule has the form  <weight> <lhs> REWRITES <rhs1> ... <rhsN>.
 * The lhs of the first rule becomes the grammar's root label.  Unary rules
 * are stored indexed by child (g.urs); binary rules are added with
 * add_brule() (g.brs).  Rules with more than two rhs symbols are binarized
 * right-to-left: the new parent label for rhs[start..n-1] is the rhs symbol
 * names joined by BINSEP.  After reading, every rule's weight is divided by
 * the total weight accumulated for its parent.
 *
 * Exits with EXIT_FAILURE on malformed input: non-positive weight, missing
 * lhs, empty rhs, rhs with MAXRHS or more symbols, or a binarized label that
 * does not fit in MAXBLABELLEN characters.
 */
grammar
read_grammar(FILE *fp, si_t si)
{
  sihashbrs left_brules_ht = make_sihashbrs(NLABELS);
  sihashurs child_urules_ht = make_sihashurs(NLABELS);
  sihashf parent_weight_ht = make_sihashf(NLABELS);
  brihashbr brihtbr = make_brihashbr(NLABELS);
  int n;
  double weight;
  urule ur;
  sihashbrsit bhit;
  sihashursit uhit;
  size_t  root_label = 0, lhs, cat, rhs[MAXRHS];

  while ((n = fscanf(fp, " %lg ", &weight)) == 1) {     /* read the count */
    lhs = read_cat(fp, si);

    /* the grammar file is external input: validate it explicitly so the
     * checks survive NDEBUG builds */
    if (!(weight > 0)) {
      fprintf(stderr, "read_grammar() in grammar.c: rule weight must be positive\n");
      exit(EXIT_FAILURE);
    }
    if (!lhs) {
      fprintf(stderr, "read_grammar() in grammar.c: rule with missing lhs\n");
      exit(EXIT_FAILURE);
    }
    if (!root_label)
      root_label = lhs;

    /* read the rewrites symbol; a literal-only format performs no
     * conversions, so the return value carries nothing to check */
    fscanf(fp, " " REWRITES);

    for (n = 0; n < MAXRHS; n++) {      /* read the rhs, n is length of rhs */
      cat = read_cat(fp, si);
      if (!cat)
        break;
      rhs[n] = cat;
    }

    if (n >= MAXRHS) {
      fprintf(stderr, "read_grammar() in grammar.c: rule rhs too long\n");
      exit(EXIT_FAILURE);
    }

    switch (n) {
    case 0:
      fprintf(stderr, "read_grammar() in grammar.c: rule with empty rhs\n");
      exit(EXIT_FAILURE);
      break;
    case 1:
      ur = make_urule(weight, lhs, rhs[0]);
      push_urule(child_urules_ht, ur->child, ur);
      sihashf_inc(parent_weight_ht, ur->parent, weight);
      break;
    case 2:
      add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], rhs[1]);
      sihashf_inc(parent_weight_ht, lhs, weight);
      break;
    default:
      { int start, i, j;
        char bcat[MAXBLABELLEN], *s;
        si_index bparent, left, right;

        right = rhs[n-1];               /* rightmost category */
        for (start = n-2; start >= 1; start--) {

          i = 0;                        /* i is index into bcat[] */
          for (j = start; j < n; j++) { /* j is index into rhs[] */
            if (j != start) {
              /* always leave room for the terminating '\0' */
              if (i >= MAXBLABELLEN - 1) {
                fprintf(stderr, "read_grammar() in grammar.c: binarized label too long\n");
                exit(EXIT_FAILURE);
              }
              bcat[i++] = BINSEP;
            }

            for (s = si_index_string(si, rhs[j]); *s; s++) {
              if (i >= MAXBLABELLEN - 1) {
                fprintf(stderr, "read_grammar() in grammar.c: binarized label too long\n");
                exit(EXIT_FAILURE);
              }
              bcat[i++] = *s;
            }
          }

          bcat[i] = '\0';
          bparent = si_string_index(si, bcat);
          left = rhs[start];
          add_brule(left_brules_ht, brihtbr, weight, bparent, left, right);
          sihashf_inc(parent_weight_ht, bparent, weight);
          right = bparent;              /* becomes right child of next rule */
        }

        add_brule(left_brules_ht, brihtbr, weight, lhs, rhs[0], right);
        sihashf_inc(parent_weight_ht, lhs, weight);
      }
      break;
    }
  }

  free_brihashbr(brihtbr);      /* free brindex hash table */

  {
    size_t i; /* normalize grammar rules */

    for (bhit = sihashbrsit_init(left_brules_ht); sihashbrsit_ok(bhit); bhit = sihashbrsit_next(bhit))
      for (i = 0; i < bhit.value.n; i++)
        bhit.value.e[i]->prob /= sihashf_ref(parent_weight_ht, bhit.value.e[i]->parent);

    for (uhit = sihashursit_init(child_urules_ht); sihashursit_ok(uhit); uhit = sihashursit_next(uhit))
      for (i = 0; i < uhit.value.n; i++)
        uhit.value.e[i]->prob /= sihashf_ref(parent_weight_ht, uhit.value.e[i]->parent);
  }

  free_sihashf(parent_weight_ht);

  {
    grammar g;
    g.urs = child_urules_ht;
    g.brs = left_brules_ht;
    g.root_label = root_label;
    return g;
  }
}