/**
 * word_has_connector() - test whether a dictionary entry uses a connector.
 *
 * dn is a dictionary node (an entry in some dictionary), cs is a connector
 * string, and direction selects which side of each disjunct is searched:
 * 0 = right-pointing connectors, 1 = left-pointing connectors.
 *
 * The disjuncts for dn are built, every connector on the selected side is
 * compared against cs with easy_match() (per the original author, the same
 * "smart-match" used by the parser), and the disjuncts are freed again.
 * This can be used to see whether a word belongs to a category (by looking
 * for a category connector in a table), or whether a word has a given
 * connector in a normal dictionary.
 *
 * Returns 1 if a matching connector is found, 0 if none is found (this
 * includes a direction other than 0 or 1, for which nothing is examined),
 * and -1 if dn is NULL.
 */
int word_has_connector(Dict_node * dn, char * cs, int direction)
{
	Connector *conn = NULL;
	Disjunct *all, *dj;
	int found = 0;

	if (dn == NULL) return -1;

	all = build_disjuncts_for_dict_node(dn);
	if (all == NULL) return 0;

	for (dj = all; (dj != NULL) && !found; dj = dj->next)
	{
		switch (direction)
		{
			case 0: conn = dj->right; break;
			case 1: conn = dj->left; break;
			default: break; /* conn keeps its previous (NULL) value */
		}

		while (conn != NULL)
		{
			if (easy_match(conn->string, cs) == 1)
			{
				found = 1;
				break;
			}
			conn = conn->next;
		}
	}

	free_disjuncts(all);
	return found;
}
/**
 * dict_display_word_info() - print a summary of the dictionary entries
 * that match the word s.
 *
 * For every node returned by dictionary_lookup_list() one line is written
 * to stdout containing the node's string, the number of disjuncts built
 * for it, and, when the node has a file attached, that file's name in
 * angle brackets. If nothing matches, a single message saying so is
 * printed instead.
 */
void dict_display_word_info(Dictionary dict, const char * s)
{
	Dict_node *matches = dictionary_lookup_list(dict, s);
	Dict_node *node;

	if (NULL == matches)
	{
		printf(" \"%s\" matches nothing in the dictionary.\n", s);
		return;
	}

	printf("Matches:\n");
	node = matches;
	while (node != NULL)
	{
		/* Build the disjuncts only to count them, then discard them. */
		Disjunct *list = build_disjuncts_for_dict_node(node);
		Disjunct *dj;
		int n_disjuncts = 0;

		for (dj = list; dj != NULL; dj = dj->next)
			n_disjuncts++;
		free_disjuncts(list);

		printf(" ");
		left_print_string(stdout, node->string, " ");
		printf(" %5d disjuncts ", n_disjuncts);
		if (node->file != NULL)
			printf("<%s>", node->file->file);
		printf("\n");

		node = node->right;
	}

	free_lookup_list(matches);
}
/**
 * Pack all the disjuncts and connectors of a sentence into one big
 * memory block.
 * According to the original author this gives better memory caching for
 * long sentences (a performance gain of a few percent).
 *
 * Sentences shorter than SHORTEST_SENTENCE_TO_PACK are left untouched.
 *
 * The allocated block holds, in this order:
 * 1. A block for the disjuncts.
 * 2. A small alignment gap: the disjunct block size is passed through
 *    ALIGN() with CONN_ALIGNMENT (== sizeof(Connector)), presumably
 *    rounding it up to a multiple of the connector size.
 * 3. A block for the connectors.
 *
 * The intent stated by the original author is that an integral number of
 * connectors fits in each cache line (so that no connector straddles two
 * cache lines), assuming the Connector size is a power of 2 not larger
 * than the cache line.
 * NOTE(review): the original comment gave the Connector size as "32 bit"
 * (probably meaning 32 bytes) and spoke of a 64-byte boundary; the code
 * only aligns the connector block's offset from the start of the block to
 * sizeof(Connector), so the absolute alignment depends on what malloc()
 * returns -- confirm.
 *
 * The block is recorded in sent->disjuncts_connectors_memblock. Each
 * word's disjunct list is replaced by its packed copy and the original
 * list is freed.
 *
 * If the block cannot be allocated, sent->disjuncts_connectors_memblock
 * is set to NULL and the disjuncts are left as they are (unpacked), the
 * same as for sentences that are too short to pack.
 *
 * FIXME: 1. Find the "best" value for SHORTEST_SENTENCE_TO_PACK.
 *        2. Maybe this check should be done in two stages, the second one
 *           using thresholds on the number of disjuncts and connectors.
 */
static void pack_sentence(Sentence sent)
{
	int dcnt = 0;
	int ccnt = 0;

	if (sent->length < SHORTEST_SENTENCE_TO_PACK) return;

	/* Count the disjuncts and connectors to size the block. */
	for (size_t w = 0; w < sent->length; w++)
	{
		Disjunct *d;

		for (d = sent->word[w].d; NULL != d; d = d->next)
		{
			dcnt++;
			for (Connector *c = d->right; c != NULL; c = c->next) ccnt++;
			for (Connector *c = d->left; c != NULL; c = c->next) ccnt++;
		}
	}

#define CONN_ALIGNMENT sizeof(Connector)
	size_t dsize = dcnt * sizeof(Disjunct);
	dsize = ALIGN(dsize, CONN_ALIGNMENT); /* Align connector block. */
	size_t csize = ccnt * sizeof(Connector);

	void *memblock = malloc(dsize + csize);
	sent->disjuncts_connectors_memblock = memblock;

	/* Packing is only an optimization: on allocation failure (or a
	 * zero-sized request answered with NULL) keep the unpacked lists
	 * instead of writing through a NULL-based block. */
	if (NULL == memblock) return;

	Disjunct *dblock = memblock;
	Connector *cblock = (Connector *)((char *)memblock + dsize);

	for (size_t i = 0; i < sent->length; i++)
	{
		Disjunct *word_disjuncts = sent->word[i].d;

		sent->word[i].d = pack_disjuncts_dup(word_disjuncts, &dblock, &cblock);
		free_disjuncts(word_disjuncts);
	}
}
/**
 * construct_neither() - set up the disjuncts for "neither ... nor".
 *
 * Does nothing unless the sentence contains the word "neither".
 * Otherwise, every word "neither" gets a special disjunct (built with
 * NEITHER_LABEL) put in front of its disjunct list, and afterwards the
 * disjunct list of every word "nor" is passed through
 * glom_aux_connector() with the same label.
 */
static void construct_neither(Sentence sent)
{
	int w;

	if (!sentence_contains(sent, L"neither"))
	{
		/* Historical note (DS, 3/98): the disabled code below used to
		 * remove all the disjuncts of a "nor" that has no "neither".
		 * The author no longer saw the point of doing so, nor any
		 * problem with keeping what the dictionary explicitly defines
		 * for "nor". */
#if 0
		for (w=0; w<sent->length; w++) {
			if (wcscmp(sent->word[w].string, L"nor") != 0) continue;
			free_disjuncts(sent->word[w].d);
			sent->word[w].d = NULL; /* a nor with no neither is dead */
		}
#endif
		return;
	}

	/* First pass: extend every "neither". */
	for (w = 0; w < sent->length; w++)
	{
		if (wcscmp(sent->word[w].string, L"neither") == 0)
		{
			sent->word[w].d = catenate_disjuncts(
				special_disjunct(NEITHER_LABEL, L'+', L"", L"neither"),
				sent->word[w].d);
		}
	}

	/* Second pass: adjust every "nor". */
	for (w = 0; w < sent->length; w++)
	{
		if (wcscmp(sent->word[w].string, L"nor") == 0)
		{
			sent->word[w].d =
				glom_aux_connector(sent->word[w].d, NEITHER_LABEL, TRUE);
		}
	}
}
/**
 * Set the word fields of the connectors (via set_dist_fields()), and drop
 * the disjuncts that are so long that they would have to connect beyond
 * either end of the sentence.
 *
 * A disjunct is kept only if set_dist_fields() on its left connectors
 * returns a value >= 0 and on its right connectors returns a value below
 * the sentence length. The right side is not processed when the left side
 * already disqualifies the disjunct.
 *
 * The surviving disjuncts of each word end up in reverse order.
 */
static void setup_connectors(Sentence sent)
{
	size_t w;

	for (w = 0; w < sent->length; w++)
	{
		Disjunct *kept = NULL;
		Disjunct *cur = sent->word[w].d;

		while (cur != NULL)
		{
			Disjunct *following = cur->next;
			int fits =
				(set_dist_fields(cur->left, w, -1) >= 0) &&
				(set_dist_fields(cur->right, w, 1) < (int) sent->length);

			if (fits)
			{
				/* Push onto the front of the kept list. */
				cur->next = kept;
				kept = cur;
			}
			else
			{
				/* Detach first so only this disjunct is freed. */
				cur->next = NULL;
				free_disjuncts(cur);
			}
			cur = following;
		}
		sent->word[w].d = kept;
	}
}
/**
 * Free the disjunct list of every word in the sentence and reset each
 * list pointer to NULL. If the sentence contains a conjunction, the AND
 * tables are freed as well.
 */
void free_sentence_disjuncts(Sentence sent)
{
	int w = 0;

	while (w < sent->length)
	{
		free_disjuncts(sent->word[w].d);
		sent->word[w].d = NULL;
		++w;
	}

	if (!sentence_contains_conjunction(sent)) return;
	free_AND_tables(sent);
}
/**
 * Free the disjunct list of every word in the sentence and reset each
 * list pointer to NULL.
 */
static void free_sentence_disjuncts(Sentence sent)
{
	size_t w = 0;

	while (w < sent->length)
	{
		free_disjuncts(sent->word[w].d);
		sent->word[w].d = NULL;
		++w;
	}
}
/**
 * Release the connectors and the disjuncts owned by one linkage:
 * the rc and lc connector of every link in link_array, and the chosen
 * disjunct list of every word.
 */
void free_linkage_connectors_and_disjuncts(Linkage lkg)
{
	size_t link_no = 0;
	size_t word_no = 0;

	/* Connectors: right one first, then left one, for each link. */
	while (link_no < lkg->num_links)
	{
		free(lkg->link_array[link_no].rc);
		free(lkg->link_array[link_no].lc);
		link_no++;
	}

	/* Disjuncts chosen for each word. */
	while (word_no < lkg->num_words)
	{
		free_disjuncts(lkg->chosen_disjuncts[word_no]);
		word_no++;
	}
}
/**
 * Free everything hanging off the sentence's word array: for each word
 * its X-nodes, its disjuncts and its alternatives; then the array itself.
 * sent->word is left set to NULL.
 */
static void free_sentence_words(Sentence sent)
{
	size_t w = 0;

	while (w < sent->length)
	{
		free_X_nodes(sent->word[w].x);
		free_disjuncts(sent->word[w].d);
		free(sent->word[w].alternatives);
		w++;
	}

	free((void *) sent->word);
	sent->word = NULL;
}
/**
 * For every word of the sentence, free the disjuncts whose "marked" field
 * is zero and keep the others.
 *
 * The kept disjuncts of each word end up in reverse order.
 */
void delete_unmarked_disjuncts(Sentence sent)
{
	int w;

	for (w = 0; w < sent->length; w++)
	{
		Disjunct *survivors = NULL;
		Disjunct *cur = sent->word[w].d;

		while (cur != NULL)
		{
			Disjunct *following = cur->next;

			if (!cur->marked)
			{
				/* Detach first so only this disjunct is freed. */
				cur->next = NULL;
				free_disjuncts(cur);
			}
			else
			{
				cur->next = survivors;
				survivors = cur;
			}
			cur = following;
		}
		sent->word[w].d = survivors;
	}
}
/**
 * Takes the list of disjuncts pointed to by d, eliminates all
 * duplicates, and returns a pointer to the resulting list.
 * The disjuncts that are eliminated are freed.
 *
 * The disjuncts are distributed over a temporary hash table. Each
 * incoming disjunct is compared, with disjunct_matches_alam(), against
 * the disjuncts already in its bucket: either the incoming one is
 * dropped because an existing one covers it, or existing ones that it
 * covers are dropped and it is added to the bucket. At the end the
 * buckets are chained back together into a single list, so the order of
 * the result generally differs from the order of the input.
 *
 * If verbosity > 2 and something was eliminated, the number of
 * eliminated disjuncts is printed.
 */
Disjunct * eliminate_duplicate_disjuncts(Disjunct * d)
{
	int i;
	int killed = 0;
	Disjunct *result = NULL;
	disjunct_dup_table *table;

	table = disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d)));

	while (d != NULL)
	{
		Disjunct *rest = d->next;
		int bucket = hash_disjunct(d);
		Disjunct *kept = NULL; /* the rebuilt content of this bucket */
		Disjunct *old = table->dup_table[bucket];
		Disjunct *old_next;

		for (; old != NULL; old = old_next)
		{
			old_next = old->next;

			if (disjunct_matches_alam(old, d))
			{
				/* d is the one to be killed. The rest of the
				 * bucket (old and what follows it) stays as is. */
				d->next = NULL;
				free_disjuncts(d);
				killed++;
				kept = catenate_disjuncts(kept, old);
				break;
			}

			if (disjunct_matches_alam(d, old))
			{
				/* old is the one to be killed. */
				old->next = NULL;
				free_disjuncts(old);
				killed++;
			}
			else
			{
				/* Neither is killed; old stays in the bucket. */
				old->next = kept;
				kept = old;
			}
		}

		if (old == NULL)
		{
			/* The scan ran to the end, so d survived: store it. */
			d->next = kept;
			kept = d;
		}
		table->dup_table[bucket] = kept;

		d = rest;
	}

	/* Chain the content of all the buckets into one list. */
	for (i = 0; i < table->dup_table_size; i++)
	{
		Disjunct *entry = table->dup_table[i];

		while (entry != NULL)
		{
			Disjunct *entry_next = entry->next;

			entry->next = result;
			result = entry;
			entry = entry_next;
		}
	}

	if ((verbosity > 2) && (killed != 0))
		printf("killed %d duplicates\n", killed);

	disjunct_dup_table_delete(table);
	return result;
}
/**
 * classic_parse() -- parse the given sentence.
 * Perform parsing, using the original link-grammar parsing algorithm
 * given in the original link-grammar papers.
 *
 * The parse is done with the minimum number of null-links within the
 * range given by opts->min_null_count and opts->max_null_count (the
 * latter is capped by the sentence length): do_parse() is called with an
 * increasing null_count, from opts->min_null_count up to and including
 * the maximum, until a valid linkage is found.
 *
 * About the save/restore of disjuncts that is done here:
 * To speed up parsing, pp_and_power_prune() is invoked before do_parse()
 * in order to remove connectors that have no possibility to connect.
 * It includes a significant optimization when null_count==0, which
 * removes more aggressively, but that optimization is not appropriate
 * when null_count>0.
 *
 * So if this optimization has been done and no complete parse (i.e. a
 * parse with null_count==0) is found, the sentence is left with
 * disjuncts that are not appropriate for further do_parse() tries with
 * null_count>0. To solve that, the original disjuncts of the sentence
 * are restored and pp_and_power_prune() is invoked once more.
 */
void classic_parse(Sentence sent, Parse_Options opts)
{
	fast_matcher_t * matcher = NULL;
	count_context_t * counter = NULL;
	bool pruned_for_nulls = false;
	Disjunct **saved_disjuncts = NULL;
	const bool starts_at_zero = (0 == opts->min_null_count);
	const int max_null_count = MIN((int)sent->length, opts->max_null_count);

	/* Build lists of disjuncts */
	prepare_to_parse(sent, opts);
	if (resources_exhausted(opts->resources)) return;

	if (starts_at_zero && (0 < max_null_count))
	{
		/* Keep a copy of the disjuncts in case a parse with
		 * null_count>0 turns out to be needed. */
		saved_disjuncts = alloca(sent->length * sizeof(Disjunct *));
		for (size_t w = 0; w < sent->length; w++)
			saved_disjuncts[w] = disjuncts_dup(sent->word[w].d);
	}

	for (int nl = opts->min_null_count; nl <= max_null_count; nl++)
	{
		Count_bin hist;
		s64 total;

		if (!pruned_for_nulls)
		{
			if (0 != nl)
			{
				pruned_for_nulls = true;

				/* Temporarily raise min_null_count so that pruning
				 * does not optimize for null_count==0. */
				if (starts_at_zero) opts->min_null_count = 1;

				/* The previous pass was done with null_count==0:
				 * put the saved disjuncts back. The sentence takes
				 * over their ownership. */
				if (NULL != saved_disjuncts)
				{
					free_sentence_disjuncts(sent);
					for (size_t w = 0; w < sent->length; w++)
						sent->word[w].d = saved_disjuncts[w];
					saved_disjuncts = NULL;
				}
			}

			pp_and_power_prune(sent, opts);
			if (starts_at_zero) opts->min_null_count = 0;
			if (resources_exhausted(opts->resources)) break;

			/* The disjuncts have changed: rebuild what depends on them. */
			free_count_context(counter, sent);
			free_fast_matcher(sent, matcher);
			pack_sentence(sent);
			counter = alloc_count_context(sent);
			matcher = alloc_fast_matcher(sent);
			print_time(opts, "Initialized fast matcher");
		}

		if (resources_exhausted(opts->resources)) break;
		free_linkages(sent);

		sent->null_count = nl;
		hist = do_parse(sent, matcher, counter, sent->null_count, opts);
		total = hist_total(&hist);

		lgdebug(D_PARSE, "Info: Total count with %zu null links: %lld\n",
		        sent->null_count, total);

		/* total is 64-bit, num_linkages_found is 32-bit. Clamp;
		 * a negative total is also mapped to INT_MAX. */
		if ((total > INT_MAX) || (total < 0)) total = INT_MAX;

		sent->num_linkages_found = (int) total;
		print_time(opts, "Counted parses");

		extractor_t * pex = extractor_new(sent->length, sent->rand_state);
		bool ovfl = setup_linkages(sent, pex, matcher, counter, opts);
		process_linkages(sent, pex, ovfl, opts);
		free_extractor(pex);

		post_process_lkgs(sent, opts);

		if (sent->num_valid_linkages > 0) break;

		if ((0 == nl) && (0 < max_null_count) && verbosity > 0)
			prt_error("No complete linkages found.\n");

		/* If we are here, then no valid linkages were found.
		 * If there was a parse overflow, give up now. */
		if (PARSE_NUM_OVERFLOW < total) break;
	}
	sort_linkages(sent, opts);

	/* The saved copies were never handed back to the sentence. */
	if (NULL != saved_disjuncts)
	{
		for (size_t w = 0; w < sent->length; w++)
			free_disjuncts(saved_disjuncts[w]);
	}
	free_count_context(counter, sent);
	free_fast_matcher(sent, matcher);
}
/**
 * position_words() - recursively assign positions to the words that are
 * linked to currentword.
 *
 * dict:             dictionary in which the word is looked up.
 * alink:            list of links; entries with ignore == 1 are skipped.
 * currentword:      index of the word being traced.
 * direction:        0 = follow the left connectors of the chosen
 *                   disjunct, 1 = follow the right connectors. With any
 *                   other value no connector is followed.
 * leftend/rightend: the interval within which the words reached from
 *                   here are placed.
 *
 * A disjunct for the word is chosen as the first one accepted by
 * evaluate_disjunct(). Then, for every connector on the selected side of
 * that disjunct, the links are searched for one with a matching
 * connector that has the current word on the appropriate end and whose
 * other word is still unpositioned (word_position[] == -1.0). That other
 * word is given a position inside the interval, recorded in
 * word_position[], and the function recurses on it in both directions
 * with a narrowed interval.
 *
 * Returns 1 on success, and 0 if no usable disjunct was found for this
 * word or for any word reached recursively from it. A word that is not
 * found in the dictionary is treated as having no usable disjunct.
 *
 * Uses the file-level tword[], word_position[] and localv (trace output
 * is produced when localv == 2).
 */
int position_words(Dictionary dict, Alink * alink, int currentword, int direction, double leftend, double rightend)
{
	/* direction: left = 0, right = 1 */
	Disjunct * d, * d0;
	Connector * c = NULL; /* stays NULL for a direction other than 0 or 1 */
	Dict_node * dn;
	wchar_t * s;
	wchar_t * ds;
	int numcon, n, ok;
	int linkage_found = 1;
	wchar_t * ws;
	Alink * al;
	double position;
	double range;
	double newleftend, newrightend;

	/* Right now it goes through and chooses the disjunct twice - once
	   for the right and once for the left. This seems unnecessary... */

	ds = tword[currentword].gstring;
	if(localv == 2) wprintf_s(L" Tracing word '%s', word %d, direction %d\n", ds, currentword, direction);

	/* With a conjunction: you could submit a string from one of the
	   andlist element words here as ds, instead of the conjunction
	   itself - "currentword" would still be the conjunction, though.
	   (But what about the XL and XR connectors?) */

	dn = dictionary_lookup(dict, ds);
	/* Should we go through the dict_nodes here, or is it okay to just
	   take the first one? */

	/* A failed lookup must not reach build_disjuncts_for_dict_node();
	   it is handled below like a word without a usable disjunct. */
	d0 = d = (dn != NULL) ? build_disjuncts_for_dict_node(dn) : NULL;
	for(; d!=NULL; d=d->next) {
		ok = evaluate_disjunct(d, alink, currentword);
		if(ok==1) break;
	}
	if(d == NULL) {
		if(localv == 2) wprintf_s(L"No disjunct found for word '%s'\n", ds);
		free_disjuncts(d0);
		return 0;
	}

	/* We've found a disjunct to use for the current word. Now we go
	   through all the connectors on the disjunct; for each one, we look
	   through the links to find a link of the right type with the
	   current word on one end; then we position the word on the other
	   end and repeat this process recursively */

	range=rightend-leftend;
	if(direction==0) c = d->left;
	if(direction==1) c = d->right;

	/* The interval is divided as if there were always 5 connectors. */
	numcon = 5;
	/* A better way:
	   numcon = 0;
	   if(direction == 0) {
	       for(; c!=NULL; c=c->next) numcon++;
	   } */

	n=1;   /* ordinal of the next word to be placed */
	ok = 1;
	while(c!=NULL) {
		s = c->string;
		if(localv == 2) wprintf_s(L" String from disjunct for '%s': %s\n", ds, s);
		for(al = alink; al!=NULL; al=al->next) {
			if(al->ignore == 1) continue;

			if(direction==0 && al->rightsub == currentword) {
				/* The link has the current word on the right end. */
				if (wcscmp(s, L"XR")==0) continue;
				if (easy_match (s, al->connector) == 1 && word_position[al->leftsub]==-1.0) {
					ws=al->left;
					position = rightend - (range * ((numcon+1.0 - n) / (numcon+1.0)));
					word_position[al->leftsub] = position;
					if(localv == 2) wprintf_s(L" Word '%s' has position %6.6f\n", ws, position);

					/* The new interval ends are halfway to the
					   neighbouring slots. */
					newleftend = ( position + (rightend - (range * ((numcon+1.0 - (n-1.0)) / (numcon+1.0)))) ) / 2.0;
					newrightend = ( position + (rightend - (range * ((numcon+1.0 - (n+1.0)) / (numcon+1.0)))) ) / 2.0;

					linkage_found = position_words(dict, alink, al->leftsub, 0, newleftend, position);
					if(linkage_found==0) ok = 0;
					linkage_found = position_words(dict, alink, al->leftsub, 1, position, newrightend);
					if(linkage_found==0) ok = 0;
					n++;
				}
			}

			if(direction==1 && al->leftsub == currentword) {
				/* The link has the current word on the left end. */
				if (wcscmp(s, L"XL")==0) continue;
				if (easy_match (s, al->connector) == 1 && word_position[al->rightsub]==-1.0) {
					ws = al->right;
					position = leftend + (range * ((numcon+1.0 - n) / (numcon+1.0)));
					word_position[al->rightsub] = position;
					if(localv == 2) wprintf_s(L" Word '%s' has position %6.6f\n", ws, position);

					newrightend = ( position + (leftend + (range * ((numcon+1.0 - (n-1.0)) / (numcon+1.0)))) ) / 2.0;
					newleftend = ( position + (leftend + (range * ((numcon+1.0 - (n+1.0)) / (numcon+1.0)))) ) / 2.0;

					linkage_found = position_words(dict, alink, al->rightsub, 0, newleftend, position);
					if(linkage_found==0) ok = 0;
					linkage_found = position_words(dict, alink, al->rightsub, 1, position, newrightend);
					if(linkage_found==0) ok = 0;
					n++;
				}
			}
		}
		c = c->next;
	}

	free_disjuncts(d0);
	if(ok==1) return 1;
	else return 0;
}