/*
 * Read one production rule from _file.
 *
 * The rule is read as whitespace-separated fields in this order:
 *
 *     <score> <source-name> <product-count> <product-name> ...
 *
 * _cfg      grammar used to map item names to ids (s3_cfg_str2id)
 * _file     open input stream positioned at the start of a rule
 * _score    receives the rule score (must be >= 0 in the file)
 * _src      receives the id of the rule's source, which must not be a terminal
 * _products receives the product ids followed by S3_CFG_EOR_ITEM; the caller
 *           must provide room for at least <product-count> + 1 ids (at most
 *           S3_CFG_MAX_ITEM_COUNT + 1)
 *
 * Any malformed field is reported through E_FATAL.  The outputs are written
 * only after the whole rule has been read successfully.
 */
static void
read_1rule(s3_cfg_t *_cfg, FILE *_file, float32 *_score,
           s3_cfg_id_t *_src, s3_cfg_id_t *_products)
{
    char name[S3_CFG_MAX_ITEM_STR_LEN + 1];
    char format[1024];
    s3_cfg_id_t products[S3_CFG_MAX_ITEM_COUNT + 1];
    s3_cfg_id_t src;
    s3_cfg_id_t item;
    float32 score;
    int len;
    int i;

    assert(_cfg != NULL);
    assert(_file != NULL);
    assert(_score != NULL);
    assert(_src != NULL);
    assert(_products != NULL);

    /* Build "%<N>s" so that fscanf can never overflow name[]. */
    sprintf(format, "%%%ds", S3_CFG_MAX_ITEM_STR_LEN);

    /* read the prior */
    if (fscanf(_file, "%f", &score) != 1 || score < 0)
        E_FATAL("Bad CFG production rule\n");

    /* read the source */
    if (fscanf(_file, format, name) != 1)
        E_FATAL("Bad CFG production rule\n");

    src = s3_cfg_str2id(_cfg, name);
    if (src == S3_CFG_INVALID_ID)
        E_FATAL("Bad CFG production rule\n");

    if (s3_cfg_is_terminal(src))
        E_FATAL("Bad CFG production rule\n");

    /* read the number of products */
    if (fscanf(_file, "%d", &len) != 1)
        E_FATAL("Bad CFG production rule\n");

    /* A negative count would index products[] before its start and hand
     * memcpy a huge size after conversion to size_t, so reject it here. */
    if (len < 0)
        E_FATAL("Bad CFG production rule\n");

    if (len > S3_CFG_MAX_ITEM_COUNT)
        E_FATAL("CFG Production rule too long\n");

    /* read the products */
    for (i = 0; i < len; i++) {
        if (fscanf(_file, format, name) != 1)
            E_FATAL("Bad CFG production rule\n");

        item = s3_cfg_str2id(_cfg, name);
        if (item == S3_CFG_INVALID_ID)
            E_FATAL("Bad CFG production term\n");

        products[i] = item;
    }
    products[len] = S3_CFG_EOR_ITEM;

    *_src = src;
    *_score = score;
    memcpy(_products, products, (len + 1) * sizeof(s3_cfg_id_t));
}
/*
 * Compile every non-terminal item of the grammar and allocate the
 * prediction table.
 *
 * Items are visited from the last index of _cfg->item_info down to the
 * first; each item whose id is not a terminal is handed to
 * compile_nonterm() together with logmath.  Afterwards _cfg->predictions
 * is set to a freshly calloc'ed array holding one int8 per item.
 */
void
s3_cfg_compile_rules(s3_cfg_t *_cfg, logmath_t *logmath)
{
    s3_arraylist_t *items;
    s3_cfg_item_t *cur;
    int total;
    int idx;

    assert(_cfg != NULL);

    items = &_cfg->item_info;
    total = s3_arraylist_count(items);

    /* Walk the item table backwards, skipping terminals. */
    idx = total;
    while (idx-- > 0) {
        cur = s3_arraylist_get(items, idx);
        if (s3_cfg_is_terminal(cur->id))
            continue;
        compile_nonterm(_cfg, cur, logmath);
    }

    /* One zero-initialised flag per item. */
    _cfg->predictions = (int8 *)ckd_calloc(total, sizeof(int8));
}
static void convert_cfg_rule(s3_cfg_t *_cfg, s2_fsg_t *_fsg, s3_cfg_rule_t *_rule, int _src, int _dest, int *_expansions, param_t *_params) { int index; int i, j, n; int cur, u, v; s3_cfg_id_t id; s3_cfg_item_t *item; s3_cfg_rule_t *rule; s2_fsg_trans_t *trans; cur = _src; /* Check whether the target rule has any variables that exceeded the * expansion count */ for (i = 0; i < _rule->len; i++) { id = _rule->products[i]; if (_expansions[s3_cfg_id2index(id)] > S3_CFG_MAX_FSG_EXPANSION) return; } /* Iterate through the production variables. */ for (i = 0; i < _rule->len; i++) { id = _rule->products[i]; /* For each terminal: * 1. Create a new state. * 2. Add a single definite transition from the current state to the * new state that emits the terminal. * 3. Use the new state as the current state. */ if (s3_cfg_is_terminal(id)) { trans = (s2_fsg_trans_t*)ckd_calloc(1, sizeof(s2_fsg_trans_t)); trans->from_state = cur; trans->to_state = _fsg->n_state++; trans->prob = 1.0; trans->word = (char *)ckd_salloc(s3_cfg_id2str(_cfg, id)); trans->next = _fsg->trans_list; _fsg->trans_list = trans; cur = _fsg->n_state; } /* For each non-terminal X: * 1. Create a new destination state, v. * 2. Increment expansion count for X. * 3. For each cfg rule with X as source: * a. Create a new source state, u. * b. Convert the rule with u as src and v as dest. * d. Create a new epsilon transition from the current state to u * with the rule's expansion probability. * 4. Set the current state to v. * 5. Decrement expansion count for X. 
*/ else { index = s3_cfg_id2index(id); v = _fsg->n_state++; _expansions[index]++; item = (s3_cfg_item_t *)s3u_arraylist_get(&_cfg->item_info, index); n = s3u_arraylist_count(&item->rules); for (j = 0; j < n; j++) { rule = (s3_cfg_rule_t *)s3u_arraylist_get(&item->rules, j); u = _fsg->n_state++; convert_cfg_rule(_cfg, _fsg, rule, u, v, _expansions, _params); trans = (s2_fsg_trans_t*)ckd_calloc(1, sizeof(s2_fsg_trans_t)); trans->from_state = cur; trans->to_state = u; trans->prob = rule->prob_score; trans->word = NULL; trans->next = _fsg->trans_list; _fsg->trans_list = trans; } cur = v; _expansions[index]--; } } }
/* Attempt to convert a CFG to an FSG.  No heuristic simplification is
 * performed here.  The conversion only takes place if every expansion rule
 * in the CFG has one of the following forms
 *
 *   X -> w0 w1 ... wN Y
 *   X -> w0 w1 ... wN
 *   X -> Y
 *   X -> w0
 *   X -> nil
 *
 * where X, Y are non-terminals, and w0, ..., wN are terminals.
 *
 * On success *_fsg receives the newly allocated FSG.  If the conversion is
 * not possible, the partially built FSG is released and *_fsg is set to
 * NULL.  The function is declared void, so a NULL *_fsg is the only failure
 * indication available to the caller.
 *
 * State 0 is the end state; every non-terminal that has at least one rule
 * is given its own state number.
 *
 * NOTE(review): fsg->n_state and any start-state field are not filled in
 * here; confirm whether callers expect them to be set.
 */
void
s3_cfg_convert_to_fsg(s3_cfg_t *_cfg, s2_fsg_t **_fsg)
{
    hash_table_t *item2state = NULL;
    int num_states = 1;         /* let 0 be the end state */
    int from_state;
    int to_state;
    int i, j;
    s3_cfg_id_t id;
    s3u_vector_t *items = NULL;
    s3u_vector_t *rules = NULL;
    s3_cfg_item_t *item = NULL;
    s3_cfg_rule_t *rule = NULL;
    s2_fsg_t *fsg = NULL;
    char *word;

    /* Failure is reported through *_fsg, so start from the failure value. */
    *_fsg = NULL;

    item2state = hash_new(S3_CFG_NAME_HASH_SIZE, HASH_CASE_YES);
    if (item2state == NULL)
        goto cleanup;

    fsg = (s2_fsg_t *)ckd_calloc(1, sizeof(s2_fsg_t));
    fsg->name = NULL;
    fsg->trans_list = NULL;

    /* Assign a state number to every non-terminal that can be expanded. */
    items = &_cfg->item_info;
    for (i = s3u_vec_count(items) - 1; i >= 0; i--) {
        if ((item = s3u_vec_get(items, i)) == NULL)
            goto cleanup;

        rules = &item->rules;
        if (!s3_cfg_is_terminal(item->id) &&
            (item->nil_rule != NULL || s3u_vec_count(rules) > 0))
            hash_enter_bkey(item2state, &item->id, sizeof(s3_cfg_id_t),
                            num_states++);
    }

    /* iterate through the CFG's expansion rules and convert them to FSG
     * transitions.  If at any point the conversion fails, do some cleanup
     * and return.
     */
    rules = &_cfg->rules;
    for (i = s3u_vec_count(rules) - 1; i >= 0; i--) {
        if ((rule = s3u_vec_get(rules, i)) == NULL)
            goto cleanup;

        /* A source without a state would leave from_state uninitialised. */
        if (hash_lookup_bkey(item2state, &rule->src, sizeof(s3_cfg_id_t),
                             &from_state) < 0)
            goto cleanup;

        /* a NULL production rule means we transition to the end state */
        if (rule->len == 0) {
            add_trans(fsg, from_state, 0, rule->prob_score, NULL);
        }
        else if (rule->len == 1) {
            id = rule->products[0];

            /* a single terminal means we output the terminal and transition
             * to the end state */
            if (s3_cfg_is_terminal(id)) {
                word = ((s3_cfg_item_t *)
                        s3u_vec_get(items, s3_cfg_id2index(id)))->name;
                add_trans(fsg, from_state, 0, rule->prob_score, word);
            }
            /* a single non-terminal means we take an epsilon transition */
            else {
                if (hash_lookup_bkey(item2state, &id, sizeof(s3_cfg_id_t),
                                     &to_state) < 0)
                    goto cleanup;
                add_trans(fsg, from_state, to_state, rule->prob_score, NULL);
            }
        }
        else {
            for (j = 1; j < rule->len; j++) {
                /* get the output for the transition */
                id = rule->products[j - 1];
                if (!s3_cfg_is_terminal(id))
                    goto cleanup;
                word = ((s3_cfg_item_t *)
                        s3u_vec_get(items, s3_cfg_id2index(id)))->name;

                /* get the target state for the transition */
                id = rule->products[j];
                if (s3_cfg_is_terminal(id))
                    to_state = num_states++;
                else if (hash_lookup_bkey(item2state, &id,
                                          sizeof(s3_cfg_id_t), &to_state) < 0)
                    goto cleanup;

                add_trans(fsg, from_state, to_state,
                          j == 1 ? rule->prob_score : 1.0, word);
                from_state = to_state;
            }

            /* The loop above emits products[0 .. len-2] only.  When the
             * rule ends in a terminal, that last word still has to be
             * emitted on a transition into the end state; otherwise it is
             * silently dropped and the state allocated for it is a dead
             * end. */
            id = rule->products[rule->len - 1];
            if (s3_cfg_is_terminal(id)) {
                word = ((s3_cfg_item_t *)
                        s3u_vec_get(items, s3_cfg_id2index(id)))->name;
                add_trans(fsg, from_state, 0, 1.0, word);
            }
        }
    }

    /* Success: hand the FSG to the caller and keep cleanup from freeing it. */
    *_fsg = fsg;
    fsg = NULL;

cleanup:
    /* The name-to-state map is only needed during conversion.
     * NOTE(review): hash_free is assumed to be the destructor paired with
     * hash_new in this hash table module. */
    if (item2state != NULL)
        hash_free(item2state);
    if (fsg != NULL)
        free_fsg(fsg);
    return;
}
/*
 * Evaluate one parser state: walk the entries of _state and, for each one,
 * perform the completion, scanning or prediction step selected by the
 * symbol that follows the entry's dot.
 *
 * _cfg    grammar; its item table is consulted and its `predictions`
 *         buffer is cleared and reused as per-call scratch space
 * _state  state whose entries are processed; entries may be added to it
 *         and to states reached through _state->expansions
 *
 * Side effects visible in this function: the parent state's num_expanded
 * counter is incremented, _state->num_expanded is reset to 0, and the
 * best_overall_xxx / best_completed_xxx pointers of _state are updated.
 */
static void eval_state(s3_cfg_t *_cfg, s3_cfg_state_t *_state) {
  s3_cfg_rule_t *rule = NULL;
  s3_cfg_entry_t *entry = NULL;
  s3_cfg_entry_t *cmplt_entry = NULL;
  s3_cfg_state_t *target_state = NULL;
  s3_cfg_state_t *origin_state = NULL;
  s3_cfg_item_t *item = NULL;
  s3_cfg_id_t scan;
  s3_arraylist_t *arraylist = NULL;
  int8 *predictions = NULL;
  int32 score;
  int index;
  int dot;
  int i, j;

  assert(_cfg != NULL);
  assert(_state != NULL);

  /* Tell the parent state (if any) that one more of its children has been
   * evaluated, and start this state's own counter from zero. */
  if (_state->back != NULL) { _state->back->num_expanded++; }
  _state->num_expanded = 0;

  /* One flag per grammar item, cleared at the start of every call; a set
   * flag means the item has already been predicted in this state. */
  predictions = _cfg->predictions;
  memset(predictions, 0, _cfg->item_info.count * sizeof(int8));

  /* iterate thru the entries in the state and perform prediction, scan,
   * and completion steps.  The bound is re-read on every iteration;
   * presumably add_entry(_state, ...) appends to _state->entries so that
   * newly added entries are processed by this same loop -- confirm against
   * add_entry. */
  for (i = 0; i < _state->entries.count; i++) {
    entry = (s3_cfg_entry_t *)s3_arraylist_get(&_state->entries, i);
    rule = entry->rule;
    dot = entry->dot;
    origin_state = entry->origin;
    score = entry->score;
    /* symbol immediately after the dot, and its slot in the item table */
    scan = rule->products[dot];
    index = s3_cfg_id2index(scan);
    DEBUG_ENTRY(entry);
    item = (s3_cfg_item_t *)s3_arraylist_get(&_cfg->item_info, index);

    /* saving some scores: remember the entry with the smallest score seen
     * in this state.
     * NOTE(review): a smaller score wins here, while the PARSE COMPLETION
     * comment below speaks of the "highest score" -- confirm which
     * direction is intended. */
    if (_state->best_overall_entry == NULL ||
        score < _state->best_overall_entry->score)
      _state->best_overall_entry = entry;
    if (_state->best_overall_parse == NULL ||
        score < _state->best_overall_parse->score)
      _state->best_overall_parse = entry;

    /* The end-of-rule and end-of-input markers are tested inside this
     * branch, so s3_cfg_is_terminal() evidently reports them as
     * terminals. */
    if (s3_cfg_is_terminal(scan)) {
      /************************************************************************
       * NORMAL COMPLETION
       *
       * When we encounter an entry of the form
       *
       *   $X -> (A * #EOR#, s0, i),
       *
       * we look for any entry in state S(i) of the form
       *
       *   $Z -> (A * $X B #EOR#, s1, j)
       *
       * and add the entry
       *
       *   $Z -> (A $X * B #EOR#, s0 + s1, j)
       *
       * to the current state.  We also need to keep a record of which
       * subparses were used to complete entries.  In this case, we need to
       * remember that this particular completed entry of $X is used to
       * advance the parsing of $Z.  In this case, the pointer p1 to the
       * completed entry is added to the new entry
       *
       *   $Z -> (A $X(p1) * B #EOR#, s0 + s1, j)
       *
       * for record keeping's sake.
       */
      if (scan == S3_CFG_EOR_ITEM) {
        /* from here on `scan` holds the non-terminal that was completed */
        scan = entry->rule->src;
        arraylist = &entry->origin->entries;
        for (j = s3_arraylist_count(arraylist) - 1; j >= 0; j--) {
          cmplt_entry = (s3_cfg_entry_t *)s3_arraylist_get(arraylist, j);
          if (cmplt_entry->rule->products[cmplt_entry->dot] == scan)
            add_entry(_state, cmplt_entry->rule, cmplt_entry->dot + 1,
                      cmplt_entry->origin,
                      cmplt_entry->score + entry->score,
                      cmplt_entry, entry);
        }
      }
      /************************************************************************
       * PARSE COMPLETION
       *
       * We encountered an entry of the form
       *
       *   ($PSTART -> $START * #EOI#, s, i).
       *
       * Instead of waiting for an input symbol #EOI# and completing the
       * pseudo-start rule in the next state, we finish the parse here and
       * save us a step.  We do need to check against other completed parses
       * in this state, since only the parse with the highest score is kept.
       */
      else if (scan == S3_CFG_EOI_ITEM) {
        if (_state->best_completed_entry == NULL ||
            score < _state->best_completed_entry->score)
          _state->best_completed_entry = entry;
        if (_state->best_completed_parse == NULL ||
            score < _state->best_completed_parse->score)
          _state->best_completed_parse = entry;
      }
      /************************************************************************
       * NORMAL SCANNING
       *
       * When we encounter an entry of the form
       *
       *   ($X -> A * y B #EOR#, s, i),
       *
       * and the input symbol/terminal is y, we add to the next state the
       * entry
       *
       *   ($X -> A y * B #EOR#, s, i)
       *
       * The next state is looked up in _state->expansions by the terminal's
       * index and created with add_state() when the slot is still NULL.
       */
      else {
        index = s3_cfg_id2index(scan);
        arraylist = &_state->expansions;
        target_state = (s3_cfg_state_t *)s3_arraylist_get(arraylist, index);
        if (target_state == NULL)
          target_state = add_state(_cfg, _state, scan);
        add_entry(target_state, rule, dot + 1, origin_state, score, entry,
                  NULL);
      }
    }
    else {
      /************************************************************************
       * AUTOMATIC COMPLETION OF EPSILON PRODUCING NON-TERMINALS
       *
       * When we encounter an entry of the form
       *
       *   ($X -> A * $Y B #EOR#, s0, i),
       *
       * we check whether $Y is an epsilon producing non-terminal, i.e.,
       * whether the rule
       *
       *   $Y -> #EOR#
       *
       * exists.  If that is the case, we do not add any entry corresponding
       * to such epsilon producing rule.  Instead, we take a short-cut by
       * adding the following entry to the current state
       *
       *   ($X -> A $Y(null) * B #EOR#, s0 + s1, i),
       *
       * where s1 is the log score of the epsilon rule.  Note in this new
       * entry, the completed non-terminal $Y has a NULL sub-parse pointer.
       */
      if (item->nil_rule != NULL)
        add_entry(_state, rule, dot + 1, origin_state,
                  score + item->nil_rule->log_score, entry, NULL);

      /************************************************************************
       * NORMAL PREDICTION
       *
       * When we encounter an entry of the form
       *
       *   ($X -> A * $Y B #EOR#, s0, i),
       *
       * we want to expand the non-terminal $Y.  That is, we add an entry for
       * each rule that has $Y on its left-hand side.  However, we don't want
       * to keep repeated copies of the same entries, so we keep track of
       * which non-terminals we've already expanded in a table.
       *
       * Rules whose first product is #EOR# (epsilon rules) are skipped here;
       * they are covered by the automatic completion above.  Note that
       * `rule` is reused as the loop variable and no longer refers to the
       * current entry's rule afterwards.
       */
      if (!predictions[index]) {
        predictions[index] = 1;
        arraylist = &item->rules;
        for (j = s3_arraylist_count(arraylist) - 1; j >= 0; j--) {
          rule = (s3_cfg_rule_t *)s3_arraylist_get(arraylist, j);
          if (rule->products[0] != S3_CFG_EOR_ITEM)
            add_entry(_state, rule, 0, _state, rule->log_score, NULL, NULL);
        }
      }
    }
  }
}
s3_cfg_t * s3_cfg_read_simple(const char *_fn) { s3_cfg_t *cfg = NULL; FILE *file = NULL; s3_cfg_id_t src; s3_cfg_id_t products[S3_CFG_MAX_ITEM_COUNT + 1]; s3_cfg_id_t item; char name[S3_CFG_MAX_ITEM_STR_LEN + 1]; char format[1024]; float32 score; int len; int i; assert(_fn != NULL); cfg = (s3_cfg_t *)ckd_calloc(1, sizeof(s3_cfg_t)); s3_cfg_init(cfg); if ((file = fopen(_fn, "r")) == NULL) E_FATAL("Cannot open input plain cfg file"); sprintf(format, "%%%ds", S3_CFG_MAX_ITEM_STR_LEN); while (!feof(file)) { /* read the prior */ if (fscanf(file, "%f", &score) != 1 || score < 0) break; /* read the source */ if (fscanf(file, format, name) != 1) E_FATAL("Bad CFG production rule\n"); src = s3_cfg_str2id(cfg, name); if (src == S3_CFG_INVALID_ID) E_FATAL("Bad CFG production rule\n"); if (s3_cfg_is_terminal(src)) E_FATAL("Bad CFG production rule\n"); if (fscanf(file, "%d", &len) != 1) E_FATAL("Bad CFG production rule\n"); if (len > S3_CFG_MAX_ITEM_COUNT) E_FATAL("CFG Production rule too long\n"); /* read the products */ for (i = 0; i < len; i++) { if (fscanf(file, format, name) != 1) E_FATAL("Bad CFG production rule\n"); item = s3_cfg_str2id(cfg, name); if (item == S3_CFG_INVALID_ID) E_FATAL("Bad CFG production term\n"); products[i] = item; } products[len] = S3_CFG_EOR_ITEM; s3_cfg_add_rule(cfg, src, score, products); } fclose(file); return cfg; }