/**
 * Build the global left and right match tables for every word of the
 * sentence, and reset match_cost to zero.
 *
 * For each word, a left table and a right table are allocated with
 * xalloc(). The slot count of each table is next_power_of_two_up() of
 * the corresponding disjunct-list length; it is recorded in
 * l_table_size[] / r_table_size[]. All slots start out NULL, and then
 * every disjunct that has a left (resp. right) connector is entered via
 * put_into_match_table() with direction -1 (resp. 1).
 */
void init_fast_matcher(Sentence sent)
{
	int word;

	match_cost = 0;

	for (word = 0; word < sent->length; word++)
	{
		Disjunct *head = sent->word[word].d;
		Disjunct *dj;
		Match_node **table;
		int n_slots;
		int slot;

		/* Left-pointing table. */
		n_slots = next_power_of_two_up(left_disjunct_list_length(head));
		l_table_size[word] = n_slots;
		table = (Match_node **) xalloc(n_slots * sizeof(Match_node *));
		l_table[word] = table;

		slot = 0;
		while (slot < n_slots)
		{
			table[slot] = NULL;
			slot++;
		}

		for (dj = head; dj != NULL; dj = dj->next)
		{
			if (dj->left == NULL) continue;
			put_into_match_table(n_slots, table, dj, dj->left, -1);
		}

		/* Right-pointing table. */
		n_slots = next_power_of_two_up(right_disjunct_list_length(head));
		r_table_size[word] = n_slots;
		table = (Match_node **) xalloc(n_slots * sizeof(Match_node *));
		r_table[word] = table;

		slot = 0;
		while (slot < n_slots)
		{
			table[slot] = NULL;
			slot++;
		}

		for (dj = head; dj != NULL; dj = dj->next)
		{
			if (dj->right == NULL) continue;
			put_into_match_table(n_slots, table, dj, dj->right, 1);
		}
	}
}
fast_matcher_t* alloc_fast_matcher(const Sentence sent) { unsigned int size; size_t w; int len; Match_node ** t; Disjunct * d; fast_matcher_t *ctxt; ctxt = (fast_matcher_t *) xalloc(sizeof(fast_matcher_t)); ctxt->size = sent->length; ctxt->l_table_size = xalloc(2 * sent->length * sizeof(unsigned int)); ctxt->r_table_size = ctxt->l_table_size + sent->length; ctxt->l_table = xalloc(2 * sent->length * sizeof(Match_node **)); ctxt->r_table = ctxt->l_table + sent->length; memset(ctxt->l_table, 0, 2 * sent->length * sizeof(Match_node **)); ctxt->match_cost = 0; ctxt->mn_free_list = NULL; for (w=0; w<sent->length; w++) { len = left_disjunct_list_length(sent->word[w].d); size = next_power_of_two_up(len); ctxt->l_table_size[w] = size; t = ctxt->l_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *)); memset(t, 0, size * sizeof(Match_node *)); for (d = sent->word[w].d; d != NULL; d = d->next) { if (d->left != NULL) { put_into_match_table(size, t, d, d->left, -1); } } len = right_disjunct_list_length(sent->word[w].d); size = next_power_of_two_up(len); ctxt->r_table_size[w] = size; t = ctxt->r_table[w] = (Match_node **) xalloc(size * sizeof(Match_node *)); memset(t, 0, size * sizeof(Match_node *)); for (d = sent->word[w].d; d != NULL; d = d->next) { if (d->right != NULL) { put_into_match_table(size, t, d, d->right, 1); } } } return ctxt; }
/**
 * Allocate a connector set and populate it from the expression e.
 *
 * The hash table gets next_power_of_two_up(size_of_expression(e))
 * slots, all initialized to NULL, before
 * build_connector_set_from_expression() fills it.
 *
 * @param e  Expression handed to build_connector_set_from_expression().
 * @return The newly allocated connector set (allocated with xalloc()).
 */
Connector_set * connector_set_create(Exp *e)
{
	Connector_set *cs = (Connector_set *) xalloc(sizeof(Connector_set));
	int slot;

	cs->table_size = next_power_of_two_up(size_of_expression(e));
	cs->hash_table =
		(Connector **) xalloc(cs->table_size * sizeof(Connector *));

	slot = 0;
	while (slot < cs->table_size)
	{
		cs->hash_table[slot] = NULL;
		slot++;
	}

	build_connector_set_from_expression(cs, e);
	return cs;
}
/**
 * Compute the aligned size of an element.
 *
 * Sizes below MAX_ALIGNMENT are aligned (via ALIGN) to the nearest
 * power of 2 that is not smaller than the size; a size that already is
 * such a power of 2 is returned untouched. Sizes of MAX_ALIGNMENT and
 * above are aligned to MIN_ALIGNMENT instead.
 *
 * @param element_size  The requested element size.
 * @return The aligned element size.
 */
static size_t align_size(size_t element_size)
{
	if (element_size >= MAX_ALIGNMENT)
		return ALIGN(element_size, MIN_ALIGNMENT);

	size_t pow2 = next_power_of_two_up(element_size);
	if (pow2 == element_size)
		return element_size; /* Already a power of 2. */

	return ALIGN(element_size, pow2);
}
/**
 * Remove duplicates from the disjunct list d and return the resulting
 * list. The eliminated disjuncts are freed.
 *
 * Each incoming disjunct is compared, using disjunct_matches_alam(),
 * against the disjuncts already stored in its hash bucket of a
 * temporary table. Depending on the direction of the match, either the
 * incoming disjunct or the stored one is freed. The survivors are then
 * collected from all the buckets into one list. The order of the
 * returned list is generally not the order of the input list.
 *
 * If verbosity > 2 and something was eliminated, the number of
 * eliminated disjuncts is printed.
 *
 * @param d  Head of the disjunct list; the list is consumed.
 * @return Head of the list of surviving disjuncts (NULL if none).
 */
Disjunct * eliminate_duplicate_disjuncts(Disjunct * d)
{
	int killed = 0;
	disjunct_dup_table *dt =
		disjunct_dup_table_new(next_power_of_two_up(2 * count_disjuncts(d)));

	while (d != NULL)
	{
		Disjunct *rest = d->next;   /* Saved now: d may get freed below. */
		int bucket = hash_disjunct(d);
		Disjunct *kept = NULL;      /* The rebuilt chain of this bucket. */
		Disjunct *cur = dt->dup_table[bucket];
		int d_survives = 1;

		while (cur != NULL)
		{
			Disjunct *after = cur->next;

			if (disjunct_matches_alam(cur, d))
			{
				/* d is the one to eliminate. cur is still linked to
				 * the unvisited remainder of the chain, so appending
				 * it retains that remainder too. */
				d->next = NULL;
				free_disjuncts(d);
				killed++;
				kept = catenate_disjuncts(kept, cur);
				d_survives = 0;
				break;
			}

			if (disjunct_matches_alam(d, cur))
			{
				/* cur is the one to eliminate. */
				cur->next = NULL;
				free_disjuncts(cur);
				killed++;
			}
			else
			{
				/* Neither is eliminated; retain cur. */
				cur->next = kept;
				kept = cur;
			}

			cur = after;
		}

		if (d_survives)
		{
			/* Nothing eliminated d, so it goes into the table. */
			d->next = kept;
			kept = d;
		}
		dt->dup_table[bucket] = kept;

		d = rest;
	}

	/* Gather the contents of all the buckets into a single list. */
	Disjunct *result = NULL;
	for (int b = 0; b < dt->dup_table_size; b++)
	{
		Disjunct *cur = dt->dup_table[b];
		while (cur != NULL)
		{
			Disjunct *after = cur->next;
			cur->next = result;
			result = cur;
			cur = after;
		}
	}

	if ((killed != 0) && (verbosity > 2))
		printf("killed %d duplicates\n", killed);

	disjunct_dup_table_delete(dt);
	return result;
}
/**
 * Allocate and build the initial power hash tables.
 *
 * Every word gets 2 tables - one for its left connectors and one for
 * its right connectors - in which the connectors are hashed according
 * to their uppercase part.
 * Within a hash slot the shallow connectors come first. So when deep
 * connectors are matched against the connectors of a slot, the match
 * loop can stop once that slot has no more shallow connectors (two
 * deep connectors cannot be matched to each other).
 *
 * The suffix_id of each connector serves as its reference count, and
 * hence it should always be > 0.
 *
 * The tables are initialized through one of two code paths:
 * 1. Disjunct-jets sharing is not done. The disjuncts and connectors
 *    of each word are scanned directly, and each connector is inserted
 *    with a reference count (as suffix_id) of 1.
 * 2. The disjunct-jet tables (left and right) are used. Each of their
 *    slots contains only a pointer to a disjunct-jet. The word number
 *    is extracted from the deepest connector (to which it has been
 *    assigned by setup_connectors()).
 *
 * FIXME: Find a way to not use a reference count (to increase
 * efficiency).
 *
 * @param sent  The sentence whose connectors are entered.
 * @param pt    The power table to set up; its memory_pool, table sizes
 *              and tables are assigned here.
 */
static void power_table_init(Sentence sent, power_table *pt)
{
#define TOPSZ 32768
	size_t lr_table_max_usage = MIN(sent->dict->contable.num_con, TOPSZ);

	Pool_desc *mp = pool_new(__func__, "C_list",
	                         /*num_elements*/2048, sizeof(C_list),
	                         /*zero_out*/false, /*align*/false, /*exact*/false);
	pt->memory_pool = mp;

	for (WordIdx w = 0; w < sent->length; w++)
	{
		size_t n_conn;
		size_t l_size, r_size;
		C_list **l_t, **r_t;

		/* Variable-sized hash tables are used here. Their performance
		 * seems to be equal to or better than the best fixed-size
		 * performance, which seems to come at about a 1K table size,
		 * for both English and Russian. (Both have about 100 fixed
		 * link-types, and many thousands of auto-genned link types
		 * (IDxxx idioms for both, LLxxx suffix links for Russian).)
		 * Pluses and minuses:
		 * + small fixed tables are faster to initialize.
		 * - small fixed tables have more collisions.
		 * - variable-size tables require counting connectors
		 *   (and the more complex code to go with).
		 * CPU cache-size effects ...
		 */

		/* Left table: size it, allocate it, and clear it. */
		if (sent->jet_sharing.num_cnctrs_per_word[0])
			n_conn = sent->jet_sharing.num_cnctrs_per_word[0][w];
		else
			n_conn = left_connector_count(sent->word[w].d);

		l_size = next_power_of_two_up(MIN(n_conn, lr_table_max_usage));
		pt->l_table_size[w] = l_size;
		l_t = (C_list **) xalloc(l_size * sizeof(C_list *));
		pt->l_table[w] = l_t;
		for (unsigned int slot = 0; slot < l_size; slot++)
			l_t[slot] = NULL;

		/* Right table: size it, allocate it, and clear it. */
		if (sent->jet_sharing.num_cnctrs_per_word[1])
			n_conn = sent->jet_sharing.num_cnctrs_per_word[1][w];
		else
			n_conn = right_connector_count(sent->word[w].d);

		r_size = next_power_of_two_up(MIN(n_conn, lr_table_max_usage));
		pt->r_table_size[w] = r_size;
		r_t = (C_list **) xalloc(r_size * sizeof(C_list *));
		pt->r_table[w] = r_t;
		for (unsigned int slot = 0; slot < r_size; slot++)
			r_t[slot] = NULL;

		/* With jet sharing, the insertion is done in bulk after this
		 * loop. */
		if (sent->jet_sharing.num_cnctrs_per_word[0]) continue;

		/* Insert the deep connectors. Every connector, including the
		 * shallow one, gets a reference count of 1. */
		for (Disjunct *dj = sent->word[w].d; dj != NULL; dj = dj->next)
		{
			Connector *shallow;

			shallow = dj->right;
			if (shallow != NULL)
			{
				shallow->suffix_id = 1;
				for (Connector *deep = shallow->next; deep != NULL; deep = deep->next)
				{
					deep->suffix_id = 1;
					put_into_power_table(mp, r_size, r_t, deep, false);
				}
			}

			shallow = dj->left;
			if (shallow != NULL)
			{
				shallow->suffix_id = 1;
				for (Connector *deep = shallow->next; deep != NULL; deep = deep->next)
				{
					deep->suffix_id = 1;
					put_into_power_table(mp, l_size, l_t, deep, false);
				}
			}
		}

		/* Insert the shallow connectors. */
		for (Disjunct *dj = sent->word[w].d; dj != NULL; dj = dj->next)
		{
			if (dj->right != NULL)
				put_into_power_table(mp, r_size, r_t, dj->right, true);

			if (dj->left != NULL)
				put_into_power_table(mp, l_size, l_t, dj->left, true);
		}
	}

	if (!sent->jet_sharing.num_cnctrs_per_word[0]) return;

	/* Bulk insertion with reference count. Note: IDs start from 1. */
	for (int dir = 0; dir < 2; dir++)
	{
		C_list ***tables;
		unsigned int *table_sizes;
		int word_offset; /* Added to nearest_word to get the word number. */

		if (dir == 0)
		{
			tables = pt->l_table;
			table_sizes = pt->l_table_size;
			word_offset = 1;
		}
		else
		{
			tables = pt->r_table;
			table_sizes = pt->r_table_size;
			word_offset = -1;
		}

		/* Insert the deep connectors. Each of them receives the
		 * suffix_id of the first connector of its jet. */
		for (unsigned int id = 1; id < sent->jet_sharing.entries[dir] + 1; id++)
		{
			Connector *jet = sent->jet_sharing.table[dir][id];
			Connector *last = jet;

			while (NULL != last->next)
				last = last->next;

			int jet_word = last->nearest_word + word_offset;
			unsigned int t_size = table_sizes[jet_word];
			C_list **t = tables[jet_word];
			int refcount = jet->suffix_id;

			for (Connector *deep = jet->next; NULL != deep; deep = deep->next)
			{
				deep->suffix_id = refcount;
				put_into_power_table(mp, t_size, t, deep, false);
			}
		}

		/* Insert the shallow connectors. */
		for (unsigned int id = 1; id < sent->jet_sharing.entries[dir] + 1; id++)
		{
			Connector *jet = sent->jet_sharing.table[dir][id];
			Connector *last = jet;

			while (NULL != last->next)
				last = last->next;

			int jet_word = last->nearest_word + word_offset;

			put_into_power_table(mp, table_sizes[jet_word], tables[jet_word],
			                     jet, true);
		}
	}
}