/**
 * Insert disjunct d into hash table t, filed under connector c (the
 * connector that d's left or right pointer points to).
 *
 * size is the number of buckets in t. The bucket is selected by masking
 * the connector hash with (size-1), so size is presumably a power of two.
 *
 * dir == 1  : t is a right table.
 * otherwise : t is a left table (callers are documented to pass -1).
 */
static void put_into_match_table(int size, Match_node ** t,
                                 Disjunct * d, Connector * c, int dir )
{
	int bucket = connector_hash(c) & (size-1);
	Match_node *node = (Match_node *) xalloc(sizeof(Match_node));

	node->d = d;
	node->next = NULL;

	/* Whatever the insert helper returns becomes the bucket's new chain. */
	t[bucket] = (dir == 1)
		? add_to_right_table_list(node, t[bucket])
		: add_to_left_table_list(node, t[bucket]);
}
/**
 * Build the list of disjuncts on word w that could link to lc, to rc,
 * or to both. lw and rw are the words that lc and rc come from.
 *
 * The result is a chain of Match_nodes linked through their next
 * pointers: first the right-table candidates that are not also
 * left-table candidates, then every left-table candidate. A disjunct
 * reachable through both tables therefore shows up only once.
 *
 * Duplicates are found by checking each right candidate against each
 * left candidate, which is quadratic; every step is tallied in
 * match_cost. The original author notes that in practice match_cost
 * stays below parse_cost, so this was not considered worth optimizing.
 */
Match_node * form_match_list(Sentence sent, int w,
                             Connector *lc, int lw,
                             Connector *rc, int rw)
{
	match_context_t *ctxt = sent->match_ctxt;
	Match_node *left_bucket = NULL;
	Match_node *right_bucket = NULL;
	Match_node *left_list = NULL;
	Match_node *right_list = NULL;
	Match_node *unique_right = NULL;
	Match_node *discarded = NULL;
	Match_node *src, *copy, *cand, *probe, *tail;

	/* Look up the hash buckets; an absent connector has no candidates. */
	if (lc != NULL)
	{
		left_bucket = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	}
	if (rc != NULL)
	{
		right_bucket = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];
	}

	/* Copy the left candidates, stopping at the first one whose left
	 * connector reaches a word before lw. Copies are pushed at the head,
	 * so left_list ends up in reverse bucket order. */
	for (src = left_bucket; src != NULL; src = src->next)
	{
		if (src->d->left->word < lw) break;
		copy = get_match_node(ctxt);
		copy->d = src->d;
		copy->next = left_list;
		left_list = copy;
	}

	/* Same for the right candidates, stopping at the first one whose
	 * right connector reaches a word after rw. */
	for (src = right_bucket; src != NULL; src = src->next)
	{
		if (src->d->right->word > rw) break;
		copy = get_match_node(ctxt);
		copy->d = src->d;
		copy->next = right_list;
		right_list = copy;
	}

	/* Split the right candidates: those whose disjunct is already in
	 * left_list are set aside for recycling, the rest are kept. */
	while (right_list != NULL)
	{
		cand = right_list;
		right_list = cand->next;

		ctxt->match_cost++;
		probe = left_list;
		while (probe != NULL)
		{
			ctxt->match_cost++;
			if (cand->d == probe->d) break;
			probe = probe->next;
		}

		if (probe == NULL)
		{
			/* Not among the left candidates: keep it. */
			cand->next = unique_right;
			unique_right = cand;
		}
		else
		{
			/* Already among the left candidates: drop this copy. */
			cand->next = discarded;
			discarded = cand;
		}
	}

	/* Hand the duplicate nodes back (called even when there are none). */
	put_match_list(sent, discarded);

	/* Join the two lists: surviving right candidates, then the left ones. */
	if (unique_right == NULL) return left_list;

	tail = unique_right;
	while (tail->next != NULL)
	{
		tail = tail->next;
	}
	tail->next = left_list;

	return unique_right;
}
/**
 * Reduce a connector's hash to a slot index by masking it with
 * (CONTABSZ-1); CONTABSZ is presumably a power of two.
 *
 * Per the original comment, the underlying hash only looks at the
 * leading upper case letters of the connector string and at the label
 * fields, so two connectors that (formally) match always land in the
 * same slot. That property belongs to connector_hash, which is defined
 * elsewhere.
 */
static inline unsigned int hash_S(Connector * c)
{
	return (unsigned int) connector_hash(c) & (CONTABSZ-1);
}
/**
 * Forms and returns a list of disjuncts coming from word w, that might
 * match lc or rc or both. The lw and rw are the words from which lc
 * and rc came respectively.
 *
 * The list is returned as a linked list of Match_nodes, chained through
 * their next pointers. A disjunct found through both the left and the
 * right table appears only once. When both candidate lists are
 * non-empty, the result is the surviving right-table candidates
 * followed by all of the left-table candidates; when one of them is
 * empty, the other is returned as is.
 *
 * Duplicates are eliminated in one of two ways:
 *  - a quadratic pairwise scan, used when llen < 250 or rlen < 9; the
 *    number of steps is accumulated in 'match_cost' (the original
 *    comment says this is printed at the end when verbosity>1; that
 *    code is not visible here);
 *  - a sort-and-merge pass, used otherwise. The original author
 *    reports that for long sentences with parse overflows the match
 *    lists can be hundreds of elements long, and that the quadratic
 *    scan then took in excess of 50% of the total run time.
 *
 * Nodes in the returned list are obtained from get_match_node(); the
 * duplicate nodes are handed back through put_match_list().
 */
Match_node * form_match_list(fast_matcher_t *ctxt, int w,
                             Connector *lc, int lw,
                             Connector *rc, int rw)
{
	size_t rlen = 0, llen = 0;
	Match_node *ml, *mr, *mx, *my, *mz, *front, *free_later;

	/* Pick the hash bucket for each connector; a NULL connector has
	 * no candidates. The table sizes are used as (size-1) masks, so
	 * they are presumably powers of two. */
	if (lc != NULL) {
		ml = ctxt->l_table[w][connector_hash(lc) & (ctxt->l_table_size[w]-1)];
	} else {
		ml = NULL;
	}
	if (rc != NULL) {
		mr = ctxt->r_table[w][connector_hash(rc) & (ctxt->r_table_size[w]-1)];
	} else {
		mr = NULL;
	}

	/* Copy the left bucket into fresh nodes, stopping at the first
	 * entry whose left connector reaches a word before lw.
	 * NOTE(review): stopping early presumably relies on the bucket
	 * being ordered by that word -- confirm against the table builder.
	 * Copies are pushed at the head, so the order is reversed. */
	front = NULL;
	for (mx = ml; mx != NULL; mx = mx->next) {
		if (mx->d->left->word < lw) break;
		my = get_match_node(ctxt);
		my->d = mx->d;
		my->next = front;
		front = my;
		llen++;
	}
	ml = front; /* ml is now the list of things that could match the left */

	/* Same for the right bucket, stopping at the first entry whose
	 * right connector reaches a word after rw. */
	front = NULL;
	for (mx = mr; mx != NULL; mx = mx->next) {
		if (mx->d->right->word > rw) break;
		my = get_match_node(ctxt);
		my->d = mx->d;
		my->next = front;
		front = my;
		rlen++;
	}
	mr = front; /* mr is now the list of things that could match the right */

	/* If either list is empty there is nothing to de-duplicate. */
	if (mr == NULL) return ml;
	if (ml == NULL) return mr;

	/* Now we want to eliminate duplicates from the lists. */

	/* If either list is reasonably short, just do a quadratic search
	 * for duplicates; otherwise use the sorted search. The sorted
	 * version is used once llen >= 250 and rlen >= 9, which, based on
	 * the original author's quick measurements, is roughly where it
	 * starts to win. */
	if (llen < 250 || rlen < 9) {
		/* Perform a simple quadratic-time search, viz. two nested loops.
		 * Runtime blows up horribly for lengths over a few hundred. */
		free_later = NULL;
		front = NULL;
		for (mx = mr; mx != NULL; mx = mz) {
			/* See if mx is in the left list; keep it only if it is not.
			 * mx->next is overwritten below, so save the successor first. */
			mz = mx->next;
			ctxt->match_cost++;
			for (my=ml; my!=NULL; my=my->next) {
				ctxt->match_cost++;
				if (mx->d == my->d) break;
			}
			if (my != NULL) { /* mx was in the left list: discard this copy */
				mx->next = free_later;
				free_later = mx;
			} else { /* It was not there: keep it. */
				mx->next = front;
				front = mx;
			}
		}
		mr = front; /* mr is now the abbreviated right list */
		put_match_list(ctxt, free_later);
	} else {
		/* Perform an O(N log N) search, by sorting first, and then
		 * doing a single linear run through the sorted arrays.
		 * NOTE(review): match_cost is not updated on this path.
		 * NOTE(review): the merge below compares the nodes' d pointers,
		 * so addr_compare is assumed to order nodes by d -- confirm. */
		size_t i,j;
		Match_node* mx; /* NOTE(review): shadows the function-level mx */
		/* Scratch arrays of node pointers, obtained with alloca. */
		Match_node** mra = alloca(rlen * sizeof(Match_node*));
		Match_node** mla = alloca(llen * sizeof(Match_node*));

		i = 0;
		for (mx = mr; mx != NULL; mx = mx->next) mra[i++] = mx;
		qsort((void *) mra, rlen, sizeof(Match_node*), addr_compare);

		/* Only the array is sorted; the ml list keeps its own links. */
		i = 0;
		for (mx = ml; mx != NULL; mx = mx->next) mla[i++] = mx;
		qsort((void *) mla, llen, sizeof(Match_node*), addr_compare);

		/* Compare addresses side by side in a linear loop.
		 * Be careful not to run past the bounds of the arrays.
		 * At the top of each pass j < llen holds: it starts at 0 with
		 * llen >= 250, and the loop is left as soon as j reaches llen. */
		free_later = NULL;
		front = NULL;
		i = 0;
		j = 0;
		while (i < rlen) {
			/* Right entries below the current left entry are unique. */
			while (i < rlen && mra[i]->d < mla[j]->d) {
				mra[i]->next = front;
				front = mra[i];
				i++;
			}
			if (i == rlen) break;

			/* Same disjunct on both sides: discard the right copy. */
			if (mra[i]->d == mla[j]->d) {
				mra[i]->next = free_later;
				free_later = mra[i];
				i++;
				j++;
			}
			if (i == rlen) break;

			/* Skip left entries below the current right entry. */
			while (j < llen && mra[i]->d > mla[j]->d) j++;

			/* Left side exhausted: drain the rest of the right-hand
			 * list, since none of it can be a duplicate. */
			if (j == llen) {
				while (i < rlen) {
					mra[i]->next = front;
					front = mra[i];
					i++;
				}
				break;
			}
		}
		/* The survivors were relinked in sorted-array order, so their
		 * order no longer follows the original bucket order. */
		mr = front; /* mr is now the abbreviated right list */
		put_match_list(ctxt, free_later);
	}

	/* Now catenate the two lists: surviving right entries first, then
	 * the left list. NOTE(review): ml cannot be NULL here because of
	 * the early return above, so the second check never fires. */
	if (mr == NULL) return ml;
	if (ml == NULL) return mr;
	for (mx = mr; mx->next != NULL; mx = mx->next)
		;
	mx->next = ml;
	return mr;
}