示例#1
0
/*
 * Read a single production rule from an open grammar file.
 *
 * The expected token sequence is
 *
 *   <score> <source-name> <product-count> <product-name> ...
 *
 * _cfg       grammar used to map item names to ids
 * _file      stream positioned at the start of a rule
 * _score     receives the score read from the file
 * _src       receives the id of the rule's source (left-hand side) item
 * _products  receives the product ids followed by S3_CFG_EOR_ITEM; up to
 *            S3_CFG_MAX_ITEM_COUNT + 1 ids may be written, so the caller's
 *            buffer must be at least that large
 *
 * Any malformed input is reported through E_FATAL.
 */
static void
read_1rule(s3_cfg_t *_cfg, FILE *_file, float32 *_score,
           s3_cfg_id_t *_src, s3_cfg_id_t *_products)
{
  char name[S3_CFG_MAX_ITEM_STR_LEN + 1];
  float32 score;
  s3_cfg_id_t src;
  s3_cfg_id_t products[S3_CFG_MAX_ITEM_COUNT + 1];
  s3_cfg_id_t item;
  char format[1024];
  int len;
  int i;

  assert(_cfg != NULL);
  assert(_file != NULL);
  assert(_score != NULL);
  assert(_src != NULL);
  assert(_products != NULL);

  /* build a "%<N>s" conversion so a name can never overrun name[] */
  sprintf(format, "%%%ds", S3_CFG_MAX_ITEM_STR_LEN);

  /* read the prior */
  if (fscanf(_file, "%f", &score) != 1 || score < 0)
    E_FATAL("Bad CFG production rule\n");

  /* read the source */
  if (fscanf(_file, format, name) != 1)
    E_FATAL("Bad CFG production rule\n");

  src = s3_cfg_str2id(_cfg, name);
  if (src == S3_CFG_INVALID_ID)
    E_FATAL("Bad CFG production rule\n");

  /* the left-hand side of a rule must be a non-terminal */
  if (s3_cfg_is_terminal(src))
    E_FATAL("Bad CFG production rule\n");

  /* read the product count; a negative count would index products[] out of
   * bounds and hand memcpy a negative size, so it is rejected here */
  if (fscanf(_file, "%d", &len) != 1 || len < 0)
    E_FATAL("Bad CFG production rule\n");

  if (len > S3_CFG_MAX_ITEM_COUNT)
    E_FATAL("CFG Production rule too long\n");

  /* read the products */
  for (i = 0; i < len; i++) {
    if (fscanf(_file, format, name) != 1)
      E_FATAL("Bad CFG production rule\n");

    item = s3_cfg_str2id(_cfg, name);
    if (item == S3_CFG_INVALID_ID)
      E_FATAL("Bad CFG production term\n");
    products[i] = item;
  }
  /* terminate the product list with the end-of-rule marker */
  products[len] = S3_CFG_EOR_ITEM;

  /* outputs are only written once the whole rule has been validated */
  *_src = src;
  *_score = score;
  memcpy(_products, products, (len + 1) * sizeof(s3_cfg_id_t));
}
示例#2
0
/*
 * Compile every non-terminal item of the grammar and allocate the grammar's
 * prediction table.
 *
 * _cfg     grammar whose items are compiled; must not be NULL
 * logmath  passed through unchanged to compile_nonterm()
 *
 * Items are visited from the last slot of the item table down to the first.
 * Afterwards _cfg->predictions is set to a freshly allocated array holding
 * one int8 per item.
 */
void
s3_cfg_compile_rules(s3_cfg_t *_cfg, logmath_t *logmath)
{
  s3_arraylist_t *items;
  s3_cfg_item_t *entry;
  int count;
  int idx;

  assert(_cfg != NULL);

  items = &_cfg->item_info;
  count = s3_arraylist_count(items);

  /* walk the item table backwards, skipping terminals */
  idx = count;
  while (idx-- > 0) {
    entry = s3_arraylist_get(items, idx);
    if (s3_cfg_is_terminal(entry->id))
      continue;
    compile_nonterm(_cfg, entry, logmath);
  }

  /* one flag per item, sized from the count taken before compilation */
  _cfg->predictions = (int8 *)ckd_calloc(count, sizeof(int8));
}
/*
 * Recursively expand one CFG rule into FSG states and transitions.
 *
 * The rule's products are walked left to right starting from state _src.
 * Terminals become word-emitting transitions into freshly allocated states;
 * non-terminals are expanded by recursing into each of their rules.
 *
 * _cfg         grammar that owns the items and rules
 * _fsg         FSG under construction; n_state and trans_list are updated
 * _rule        rule to expand
 * _src         state the expansion starts from
 * _dest        state the expansion is meant to end in
 * _expansions  per-item expansion counters, indexed by s3_cfg_id2index()
 * _params      not used by this function
 *
 * NOTE(review): _dest is never referenced, so the last state reached by the
 * expansion is not linked to it -- confirm whether a closing transition from
 * the final current state to _dest is missing.
 */
static void
convert_cfg_rule(s3_cfg_t *_cfg,
                 s2_fsg_t *_fsg,
                 s3_cfg_rule_t *_rule,
                 int _src,
                 int _dest,
                 int *_expansions,
                 param_t *_params)
{
  int index;
  int i, j, n;
  int cur, u, v;
  s3_cfg_id_t id;
  s3_cfg_item_t *item;
  s3_cfg_rule_t *rule;
  s2_fsg_trans_t *trans;

  cur = _src;

  /* Check whether the target rule has any variables that exceeded the
   * expansion count.  If so, nothing is generated for this rule.
   */
  for (i = 0; i < _rule->len; i++) {
    id = _rule->products[i];
    if (_expansions[s3_cfg_id2index(id)] > S3_CFG_MAX_FSG_EXPANSION)
      return;
  }

  /* Iterate through the production variables. */
  for (i = 0; i < _rule->len; i++) {
    id = _rule->products[i];

    /* For each terminal:
     *   1.  Create a new state.
     *   2.  Add a single definite transition from the current state to the
     *       new state that emits the terminal.
     *   3.  Use the new state as the current state.
     */
    if (s3_cfg_is_terminal(id)) {
      trans = (s2_fsg_trans_t*)ckd_calloc(1, sizeof(s2_fsg_trans_t));
      trans->from_state = cur;
      trans->to_state = _fsg->n_state++;
      trans->prob = 1.0;
      trans->word = (char *)ckd_salloc(s3_cfg_id2str(_cfg, id));
      trans->next = _fsg->trans_list;
      _fsg->trans_list = trans;

      /* Continue from the state just created.  n_state has already been
       * incremented past it, so reading n_state here would name a state
       * that has not been allocated yet.
       */
      cur = trans->to_state;
    }

    /* For each non-terminal X:
     *   1.  Create a new destination state, v.
     *   2.  Increment expansion count for X.
     *   3.  For each cfg rule with X as source:
     *      a.  Create a new source state, u.
     *      b.  Convert the rule with u as src and v as dest.
     *      c.  Create a new epsilon transition from the current state to u
     *          with the rule's expansion probability.
     *   4.  Set the current state to v.
     *   5.  Decrement expansion count for X.
     */
    else {
      index = s3_cfg_id2index(id);
      v = _fsg->n_state++;
      _expansions[index]++;
      item = (s3_cfg_item_t *)s3u_arraylist_get(&_cfg->item_info, index);
      n = s3u_arraylist_count(&item->rules);
      for (j = 0; j < n; j++) {
        rule = (s3_cfg_rule_t *)s3u_arraylist_get(&item->rules, j);
        u = _fsg->n_state++;
        convert_cfg_rule(_cfg, _fsg, rule, u, v, _expansions, _params);

        /* epsilon transition (no word) into the sub-expansion */
        trans = (s2_fsg_trans_t*)ckd_calloc(1, sizeof(s2_fsg_trans_t));
        trans->from_state = cur;
        trans->to_state = u;
        trans->prob = rule->prob_score;
        trans->word = NULL;
        trans->next = _fsg->trans_list;
        _fsg->trans_list = trans;
      }

      cur = v;
      _expansions[index]--;
    }
  }
}
/* Attempt to convert a CFG to a FSG.  No heuristic simplification is
 * performed here.  The conversion only takes place if every expansion rule
 * in the CFG has one of the following forms
 *
 *   X -> w0 w1 ... wN Y
 *   X -> w0 w1 ... wN
 *   X -> Y
 *   X -> w0
 *   X -> nil
 *
 * where X, Y are non-terminals, and w0, ..., wN are terminals.
 *
 * _cfg  grammar to convert
 * _fsg  receives the newly built FSG on success.  If the conversion is not
 *       possible, any partially built FSG is released and *_fsg is set to
 *       NULL.  The function returns nothing, so callers detect failure by
 *       testing *_fsg.
 *
 * State 0 of the generated FSG is the end state; every non-terminal that has
 * at least one rule is given its own state.
 *
 * NOTE(review): the item2state table created here is never released on
 * either exit path -- free it with the hash module's destructor once its
 * name is confirmed.
 */
void
s3_cfg_convert_to_fsg(s3_cfg_t *_cfg,s2_fsg_t **_fsg)
{
  hash_table_t *item2state=NULL;
  int num_states=1; /* let 0 be the end state */
  int from_state;
  int to_state;
  int i,j;
  s3_cfg_id_t id;
  s3u_vector_t *items=NULL;
  s3u_vector_t *rules=NULL;
  s3_cfg_item_t *item=NULL;
  s3_cfg_rule_t *rule=NULL;
  s2_fsg_t *fsg=NULL;
  char *word;

  item2state=hash_new(S3_CFG_NAME_HASH_SIZE,HASH_CASE_YES);
  if (item2state==NULL)
    goto cleanup;

  fsg=(s2_fsg_t*)ckd_calloc(1,sizeof(s2_fsg_t));
  fsg->name=NULL;
  fsg->trans_list=NULL;

  /* give every non-terminal that can be expanded its own FSG state */
  items=&_cfg->item_info;
  for (i=s3u_vec_count(items)-1;i>=0;i--) {
    if ((item=s3u_vec_get(items,i))==NULL)
      goto cleanup;

    if (!s3_cfg_is_terminal(item->id) &&
        (item->nil_rule!=NULL || s3u_vec_count(&item->rules)>0))
      hash_enter_bkey(item2state,&item->id,sizeof(s3_cfg_id_t),num_states++);
  }

  /* iterate through the CFG's expansion rules and convert them to FSG
   * transitions.  If at any point the conversion fails, do some cleanup
   * and return.
   */
  rules=&_cfg->rules;
  for (i=s3u_vec_count(rules)-1;i>=0;i--) {
    if ((rule=s3u_vec_get(rules,i))==NULL)
      goto cleanup;

    /* A miss would leave from_state uninitialized.
     * NOTE(review): assumes hash_lookup_bkey returns a negative value when
     * the key is absent -- confirm against the hash module.
     */
    if (hash_lookup_bkey(item2state,&rule->src,sizeof(s3_cfg_id_t),
                         &from_state)<0)
      goto cleanup;

    /* a NULL production rule means we transition to the end state */
    if (rule->len==0)
      add_trans(fsg,from_state,0,rule->prob_score,NULL);
    else if (rule->len==1) {
      id=rule->products[0];
      /* a single terminal means we output the terminal and transition to
       * the end state
       */
      if (s3_cfg_is_terminal(id)) {
        word=((s3_cfg_item_t*)s3u_vec_get(items,s3_cfg_id2index(id)))->name;
        add_trans(fsg,from_state,0,rule->prob_score,word);
      }
      /* a single non-terminal means we take an epsilon transition */
      else {
        if (hash_lookup_bkey(item2state,&id,sizeof(s3_cfg_id_t),&to_state)<0)
          goto cleanup;
        add_trans(fsg,from_state,to_state,rule->prob_score,NULL);
      }
    }
    else {
      for (j=1;j<rule->len;j++) {

        /* get the output for the transition; only the last product of a
         * rule may be a non-terminal
         */
        id=rule->products[j-1];
        if (!s3_cfg_is_terminal(id))
          goto cleanup;
        word=((s3_cfg_item_t*)s3u_vec_get(items,s3_cfg_id2index(id)))->name;

        /* get the target state for the transition */
        id=rule->products[j];
        if (s3_cfg_is_terminal(id))
          to_state=num_states++;
        else if (hash_lookup_bkey(item2state,&id,sizeof(s3_cfg_id_t),
                                  &to_state)<0)
          goto cleanup;

        /* the rule's probability is charged once, on its first transition */
        add_trans(fsg,from_state,to_state,j==1?rule->prob_score:1.0,word);

        from_state=to_state;
      }

      /* The loop emits products 0 .. len-2.  When the rule ends in a
       * terminal, that last word still has to be emitted on a transition
       * into the end state; otherwise it is dropped and the path dangles.
       */
      id=rule->products[rule->len-1];
      if (s3_cfg_is_terminal(id)) {
        word=((s3_cfg_item_t*)s3u_vec_get(items,s3_cfg_id2index(id)))->name;
        add_trans(fsg,from_state,0,1.0,word);
      }
    }
  }

  /* record how many states were handed out, the end state included */
  fsg->n_state=num_states;

  *_fsg=fsg;
  return;

 cleanup:
  if (fsg!=NULL)
    free_fsg(fsg);

  /* report failure through the output parameter */
  *_fsg=NULL;
}
示例#5
0
/*
 * Evaluate every entry of one parser state, applying the completion,
 * scanning and prediction steps described in the comments below.
 *
 * _cfg    grammar; its item table and its predictions table are used.  The
 *         predictions table is cleared on entry.
 * _state  state whose entries are processed.  Its num_expanded counter is
 *         reset, and its best_overall_* and best_completed_* pointers are
 *         updated as entries are visited.
 */
static void
eval_state(s3_cfg_t *_cfg, s3_cfg_state_t *_state)
{
  s3_cfg_rule_t *rule = NULL;
  s3_cfg_entry_t *entry = NULL;
  s3_cfg_entry_t *cmplt_entry = NULL;
  s3_cfg_state_t *target_state = NULL;
  s3_cfg_state_t *origin_state = NULL;
  s3_cfg_item_t *item = NULL;
  s3_cfg_id_t scan;
  s3_arraylist_t *arraylist = NULL;
  int8 *predictions = NULL;
  int32 score;
  int index;
  int dot;
  int i, j;

  assert(_cfg != NULL);
  assert(_state != NULL);

  /* bump the expansion counter of the state this one links back to, if any */
  if (_state->back != NULL) {
    _state->back->num_expanded++;
  }
  _state->num_expanded = 0;

  /* start with no non-terminal marked as predicted: one flag per item */
  predictions = _cfg->predictions;
  memset(predictions, 0, _cfg->item_info.count * sizeof(int8));

  /* iterate thru the entries in the state and perform prediction, scan,
   * and completion steps.  The entry count is re-read on every iteration,
   * so entries added to this state while the loop runs are visited too
   * (presumably add_entry appends to _state->entries -- confirm). */
  for (i = 0; i < _state->entries.count; i++) {
    entry = (s3_cfg_entry_t *)s3_arraylist_get(&_state->entries, i);
    rule = entry->rule;
    dot = entry->dot;
    origin_state = entry->origin;
    score = entry->score;
    
    /* the item immediately after the dot decides which step applies */
    scan = rule->products[dot];
    index = s3_cfg_id2index(scan);

    DEBUG_ENTRY(entry);

    item = (s3_cfg_item_t *)s3_arraylist_get(&_cfg->item_info, index);

    /* saving some scores
     *
     * NOTE(review): these comparisons keep the entry with the smallest
     * score, while the PARSE COMPLETION comment below speaks of keeping the
     * parse with the highest score -- confirm which ordering is intended. */
    if (_state->best_overall_entry == NULL ||
        score < _state->best_overall_entry->score)
      _state->best_overall_entry = entry;
    
    if (_state->best_overall_parse == NULL ||
        score < _state->best_overall_parse->score)
      _state->best_overall_parse = entry;

    if (s3_cfg_is_terminal(scan)) {
      /************************************************************************
       * NORMAL COMPLETION
       *
       * When we encounter an entry of the form
       *
       *   $X -> (A * #EOR#, s0, i),
       *
       * we look for any entry in state S(i) of the form
       *
       *   $Z -> (A * $X B #EOR#, s1, j)
       *
       * and add the entry
       *
       *   $Z -> (A $X * B #EOR#, s0 + s1, j)
       *
       * to the current state.  We also need to keep a record of which
       * subparses were used to complete entries.  In this case, we need to
       * remember that this particular completed entry of $X is used to
       * advance the parsing of $Z.  In this case, the pointer p1 is added to
       * the entry
       *
       *   $Z -> (A $X(p1) * B #EOR#, s0 + s1, j)
       *
       * for record-keeping's sake.
       */
      if (scan == S3_CFG_EOR_ITEM) {
        /* scan is reused to hold the non-terminal that was just completed */
        scan = entry->rule->src;
        arraylist = &entry->origin->entries;

        /* walk the origin state's entries from last to first */
        for (j = s3_arraylist_count(arraylist) - 1; j >= 0; j--) {
          cmplt_entry = (s3_cfg_entry_t *)s3_arraylist_get(arraylist, j);

          if (cmplt_entry->rule->products[cmplt_entry->dot] == scan)
            add_entry(_state,
                      cmplt_entry->rule,
                      cmplt_entry->dot + 1,
                      cmplt_entry->origin,
                      cmplt_entry->score + entry->score,
                      cmplt_entry,
                      entry);
        }
      }
      /************************************************************************
       * PARSE COMPLETION
       *
       * We encountered an entry of the form
       *
       *   ($PSTART -> $START * #EOI#, s, i).
       *
       * Instead of waiting for an input symbol #EOI# and completing the
       * pseudo-start rule in the next state, we finish the parse here and save
       * us a step.  We do need to check against other completed parses in this
       * state, since only the parse with the highest score is kept.
       */
      else if (scan == S3_CFG_EOI_ITEM) {
        if (_state->best_completed_entry == NULL ||
            score < _state->best_completed_entry->score)
          _state->best_completed_entry = entry;

        if (_state->best_completed_parse == NULL ||
            score < _state->best_completed_parse->score)
          _state->best_completed_parse = entry;

      }
      /************************************************************************
       * NORMAL SCANNING
       *
       * When we encounter an entry of the form
       *
       *   ($X -> A * y B #EOR#, s, i),
       *
       * and the input symbol/terminal is y, we add to the next state the entry
       *
       *   ($X -> A y * B #EOR#, s, i)
       */
      else {
        /* find the successor state for this terminal in the expansions
         * table, creating it with add_state when the slot is still NULL */
        index = s3_cfg_id2index(scan);
        arraylist = &_state->expansions;
        target_state = (s3_cfg_state_t *)s3_arraylist_get(arraylist, index);
        if (target_state == NULL)
          target_state = add_state(_cfg, _state, scan);
        add_entry(target_state, rule, dot + 1, origin_state, score,
                  entry, NULL);
      }
    }
    else {
      /************************************************************************
       * AUTOMATIC COMPLETION OF EPSILON PRODUCING NON-TERMINALS
       *
       * When we encounter an entry of the form
       *
       *   ($X -> A * $Y B #EOR#, s0, i),
       *
       * we check whether $Y is an epsilon producing non-terminal, i.e.,
       * whether the rule
       * 
       *   $Y -> #EOR#
       * 
       * exists.  If that is the case, we do not add any entry corresponding to
       * such epsilon producing rule.  Instead, we take a short-cut by adding
       * the following entry to the current state
       *   
       *   ($X -> A $Y(null) * B  #EOR#, s0 + s1, i),
       *
       * where s1 is the log score of the epsilon producing rule.
       *
       * Note in this new entry, the completed non-terminal $Y has a NULL sub-
       * parse pointer.
       */
      if (item->nil_rule != NULL)
        add_entry(_state, rule, dot + 1, origin_state,
                  score + item->nil_rule->log_score, entry, NULL);

      /************************************************************************
       * NORMAL PREDICTION
       * 
       * When we encounter an entry of the form
       * 
       *   ($X -> A * $Y B #EOR#, s0, i),
       *
       * we want to expand the non-terminal $Y.  That is, we add an entry for
       * each rule that has $Y on its left-hand side.  However, we don't want
       * to keep repeated copies of the same entries, so we keep track of which
       * non-terminals we've already expanded in a table.
       */
      if (!predictions[index]) {
        predictions[index] = 1;
        arraylist = &item->rules;
        /* rule is overwritten here; it is reloaded from the entry at the top
         * of the next outer iteration.  Rules whose first product is #EOR#
         * are skipped. */
        for (j = s3_arraylist_count(arraylist) - 1; j >= 0; j--) {
          rule = (s3_cfg_rule_t *)s3_arraylist_get(arraylist, j);
          if (rule->products[0] != S3_CFG_EOR_ITEM)
            add_entry(_state, rule, 0, _state, rule->log_score, NULL, NULL);
        }
      }
    }
  }
}
示例#6
0
/*
 * Build a grammar from a plain-text rule file.
 *
 * Each rule in the file is the token sequence
 *
 *   <score> <source-name> <product-count> <product-name> ...
 *
 * _fn  path of the file to read; must not be NULL
 *
 * Returns a newly allocated, initialized grammar holding every rule read.
 * Reading stops at end of file, or at the first rule whose score cannot be
 * parsed or is negative.  A file that cannot be opened, or a rule that is
 * malformed after its score, is reported through E_FATAL.
 */
s3_cfg_t *
s3_cfg_read_simple(const char *_fn)
{
  s3_cfg_t *cfg = NULL;
  FILE *file = NULL;
  s3_cfg_id_t src;
  s3_cfg_id_t products[S3_CFG_MAX_ITEM_COUNT + 1];
  s3_cfg_id_t item;
  char name[S3_CFG_MAX_ITEM_STR_LEN + 1];
  char format[1024];
  float32 score;
  int len;
  int i;

  assert(_fn != NULL);

  cfg = (s3_cfg_t *)ckd_calloc(1, sizeof(s3_cfg_t));
  s3_cfg_init(cfg);

  if ((file = fopen(_fn, "r")) == NULL)
    E_FATAL("Cannot open input plain cfg file");

  /* build a "%<N>s" conversion so a name can never overrun name[] */
  sprintf(format, "%%%ds", S3_CFG_MAX_ITEM_STR_LEN);

  while (!feof(file)) {
    /* read the prior; failure here is how the end of the rule list is
     * detected, so it ends the loop instead of being fatal */
    if (fscanf(file, "%f", &score) != 1 || score < 0)
      break;

    /* read the source */
    if (fscanf(file, format, name) != 1)
      E_FATAL("Bad CFG production rule\n");

    src = s3_cfg_str2id(cfg, name);
    if (src == S3_CFG_INVALID_ID)
      E_FATAL("Bad CFG production rule\n");

    /* the left-hand side of a rule must be a non-terminal */
    if (s3_cfg_is_terminal(src))
      E_FATAL("Bad CFG production rule\n");

    /* read the product count; a negative count would index products[] out
     * of bounds, so it is rejected here */
    if (fscanf(file, "%d", &len) != 1 || len < 0)
      E_FATAL("Bad CFG production rule\n");

    if (len > S3_CFG_MAX_ITEM_COUNT)
      E_FATAL("CFG Production rule too long\n");

    /* read the products */
    for (i = 0; i < len; i++) {
      if (fscanf(file, format, name) != 1)
        E_FATAL("Bad CFG production rule\n");

      item = s3_cfg_str2id(cfg, name);
      if (item == S3_CFG_INVALID_ID)
        E_FATAL("Bad CFG production term\n");
      products[i] = item;
    }
    /* terminate the product list with the end-of-rule marker */
    products[len] = S3_CFG_EOR_ITEM;

    s3_cfg_add_rule(cfg, src, score, products);
  }

  fclose(file);

  return cfg;
}