/**
 * Create a single null word node standing for two or more null morphemes.
 *
 * This "unifies" null morphemes that belong to one subword when only some
 * of its morphemes (2 or more) have no linkage.
 *
 * The path entries "start" through "end" (both included) are covered by the
 * new node:
 *  - its subword is built by feeding each covered subword, in order, to
 *    add_morpheme_unmarked();
 *  - its start/end are taken from the first/last covered word;
 *  - its null_subwords list holds the covered words, in order.
 *
 * The temporary string buffer is released right after gword_new() returns,
 * so gword_new() is presumed not to retain that pointer.
 *
 * Returns the newly created word node.
 */
static Gword *wordgraph_null_join(Sentence sent, Gword **start, Gword **end)
{
	size_t total_len = 0;
	Gword **gp;
	char *joined;
	Gword *unified;

	/* Upper bound for the joined string: the sum of the subword lengths. */
	gp = start;
	while (gp <= end)
	{
		total_len += strlen((*gp)->subword);
		gp++;
	}

	joined = calloc(total_len+1, 1); /* starts out as an empty string */

	for (gp = start; gp <= end; gp++)
	{
		Gword *part = *gp;

		add_morpheme_unmarked(sent, joined, part->subword, part->morpheme_type);
	}

	unified = gword_new(sent, joined);
	free(joined);

	unified->start = (*start)->start;
	unified->end = (*end)->end;
	unified->label = "NJ";
	unified->status |= WS_PL;
	unified->null_subwords = NULL;

	/* Record, in the unifying node, the null subwords it stands for. */
	for (gp = start; gp <= end; gp++)
	{
		gwordlist_append(&unified->null_subwords, *gp);
	}

	return unified;
}
/**
 * Construct word paths (one or more) through the Wordgraph.
 *
 * Add "current_word" to the potential path.
 * Add "p" to the path queue, which defines the start of the next potential
 * paths to be checked.
 *
 * Each path is up to the current word (not including). It doesn't actually
 * construct a full path if there are null words - they break it. The final
 * path is constructed when the Wordgraph termination word is encountered.
 *
 * Note: The final path doesn't match the linkage word indexing if the linkage
 * contains empty words, at least until empty words are eliminated from the
 * linkage (in compute_chosen_words()). Further processing of the path is done
 * there in case morphology splits are to be hidden or there are morphemes
 * with null linkage.
 *
 * If "p" is already in the queue, nothing is changed.
 */
static void wordgraph_path_append(Wordgraph_pathpos **nwp, const Gword **path,
                                  Gword *current_word, /* add to the path */
                                  Gword *p)      /* add to the path queue */
{
	size_t qlen = wordgraph_pathpos_len(*nwp);
	Wordgraph_pathpos *slot;

	assert(NULL != p, "Tried to add a NULL word to the word queue");

	/* Scan the queue (terminated by an entry whose word is NULL) for "p". */
	if (NULL != *nwp)
	{
		const Wordgraph_pathpos *qp = *nwp;

		while (NULL != qp->word)
		{
			/* A match means there are 2 or more paths leading to this word
			 * (p) that end with the same number of consecutive null words
			 * that constitute an entire alternative. These null words
			 * represent different ways to split the subword upward in the
			 * hierarchy, but since they don't have linkage we don't care
			 * which of these paths is used. */
			if (qp->word == p) return; /* already queued */
			qp++;
		}
	}

	/* Not queued yet: make room for it and fill in the new entry. */
	*nwp = wordgraph_pathpos_resize(*nwp, qlen);
	slot = &(*nwp)[qlen];
	slot->word = p;

	if (MT_INFRASTRUCTURE != p->prev[0]->morpheme_type)
	{
		/* We branch to another path: start from a copy of the current one
		 * (the "+1" covers one element beyond the gwordlist_len() count). */
		size_t nbytes = (gwordlist_len(path)+1)*sizeof(*path);

		slot->path = malloc(nbytes);
		memcpy(slot->path, path, nbytes);
	}
	else
	{
		/* Previous word is the Wordgraph dummy word. Initialize the path. */
		slot->path = NULL;
	}

	/* FIXME (cast) but anyway gwordlist_append() doesn't modify Gword. */
	gwordlist_append((Gword ***)&slot->path, current_word);
}
/**
 * Compute the words to be displayed for a linkage.
 *
 * For each word position of the linkage a display string is chosen, and the
 * resulting array is stored in linkage->word. In parallel, a Wordgraph path
 * matching the displayed words is built and stored in
 * linkage->wg_path_display.
 *
 * Processing steps, in order:
 *  1. Per-word loop over linkage->chosen_disjuncts / linkage->wg_path:
 *     - Null words (NULL chosen disjunct) are accumulated into "null blocks"
 *       and emitted as one string, enclosed in NULLWORD_START/NULLWORD_END
 *       unless the word is a wall (MT_WALL).
 *     - Idiom words get their last subscript removed.
 *     - If HIDE_MORPHO, the morphemes of one unsplit word are joined into a
 *       single string (base words first, then the subscripts).
 *     - The last SUBSCRIPT_MARK is replaced by SUBSCRIPT_DOT, and a guess
 *       mark in square brackets is inserted when applicable.
 *  2. If the "removeZZZ" test is enabled, the right word of every "ZZZ" link
 *     is dropped.
 *  3. If HIDE_MORPHO, morphology links get their link_name set to NULL, and
 *     the words attached to any other link are flagged to be kept.
 *  4. The kept words are compacted into a newly allocated linkage->word,
 *     linkage->num_words is updated, and remap_linkages() is called with the
 *     old-index to new-index map.
 *
 * Parameters:
 *  sent     Its string_set receives every string generated here; it is also
 *           passed on to the helper functions.
 *  linkage  Read: chosen_disjuncts, wg_path, link_array, num_links,
 *           num_words. Written: word, num_words, wg_path_display, and the
 *           link_name of discarded links.
 *  opts     Only opts->display_morphology is read here.
 *
 * Notes:
 *  - HIDE_MORPHO and DISPLAY_GUESS_MARKS are macros defined outside this
 *    function. The local display_morphology is not referenced otherwise, so
 *    HIDE_MORPHO presumably expands to a test of it - TODO confirm.
 *  - linkage->wg_path is read at index i+1 and compared to NULL, so it is
 *    presumably NULL-terminated - TODO confirm.
 *  - Temporary strings are created with alloca()/strdupa() inside the loop.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap)); /* old index -> new index, -1 if dropped */
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word)); /* words that must not be dropped */
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of words in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			/* The block ends at the end of the path, or when the next word
			 * maps to a different sentence word. */
			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative.
			 * The block continues only if the next word is also null and
			 * both words agree on being (or not being) MT_PUNC. */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						/* The block is the nbsize path entries ending at wgp. */
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				/* The block has been emitted; reset the accumulation state. */
				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;        /* the string plus 2 brackets */
					s = (char *) alloca(l+1); /* +1 for the terminating NUL */
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm; /* position of a subscript mark, or NULL */

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */

			if (0)
			{
				/* TODO */
			}
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0; /* upper bound for the joined string length */
					size_t mcnt = 0;     /* number of morphemes to join */

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					const Gword *unsplit_word = w->unsplit_word;
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{

						/* Stop at the first word of another unsplit word. */
						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;

						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */

						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm = strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t;
										 */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								/* Append what follows sm, then a separator. */
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
						    (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						/* Skip the morphemes that were just joined. The joined
						 * string gets stored (at the end of the loop body) at
						 * the index of the last one; the preceding ones were
						 * set to "" above. */
						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{

				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				/* No guess mark if the word is not a guess and has WS_INDICT
				 * set, or if guess marks are not to be displayed. */
				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					/* baselen: length of the text before the subscript mark
					 * (the whole string if there is no subscript). */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					/* Result layout: base '[' guess_mark regex_name ']' subscript.
					 * The "+ 4" is the one explained at the baselen declaration. */
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					/* sm still points into the strdupa() copy made above, in
					 * which the mark was already replaced by SUBSCRIPT_DOT. */
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		/* Note: j is initialized here but not used in this loop. */
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		/* Keep a word if it is not NULL and is either non-empty, or
		 * morphology is displayed, or the word is flagged in show_word. */
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			/* Store the word at its compacted index j. The previous content
			 * of linkage->word[j] (just allocated above) is swapped into
			 * chosen_words[i], which is not read again in this function. */
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			remap[i] = -1; /* this word is dropped */
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}