bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts) { Wordgraph_pathpos *wp_new = NULL; Wordgraph_pathpos *wp_old = NULL; Wordgraph_pathpos *wpp; Gword **next; /* next Wordgraph words of the current word */ size_t i; Linkage_info * const lifo = &lkg->lifo; bool match_found = true; /* if all the words are null - it's still a match */ Gword **lwg_path; Dictionary afdict = sent->dict->affix_table; /* for SANEMORPHISM */ char *const affix_types = alloca(sent->length*2 + 1); /* affix types */ affix_types[0] = '\0'; /* Populate the path word queue, initializing the path to NULL. */ for (next = sent->wordgraph->next; *next; next++) { wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next); } assert(NULL != wp_new, "Path word queue is empty"); for (i = 0; i < lkg->num_words; i++) { Disjunct *cdj; /* chosen disjunct */ lgdebug(D_SLM, "%p Word %zu: ", lkg, i); if (NULL == wp_new) { lgdebug(+D_SLM, "- No more words in the wordgraph\n"); match_found = false; break; } if (wp_old != wp_new) { wordgraph_path_free(wp_old, true); wp_old = wp_new; } wp_new = NULL; //wordgraph_pathpos_print(wp_old); cdj = lkg->chosen_disjuncts[i]; /* Handle null words */ if (NULL == cdj) { lgdebug(D_SLM, "- Null word\n"); /* A null word matches any word in the Wordgraph - * so, unconditionally proceed in all paths in parallel. */ match_found = false; for (wpp = wp_old; NULL != wpp->word; wpp++) { if (NULL == wpp->word->next) continue; /* This path encountered the Wordgraph end */ /* The null words cannot be marked here because wpp->path consists * of pointers to the Wordgraph words, and these words are common to * all the linkages, with potentially different null words in each * of them. However, the position of the null words can be inferred * from the null words in the word array of the Linkage structure. 
*/ for (next = wpp->word->next; NULL != *next; next++) { match_found = true; wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); } } continue; } if (!match_found) { const char *e = "Internal error: Too many words in the linkage\n"; lgdebug(D_SLM, "- %s", e); prt_error("Error: %s.", e); break; } assert(MT_EMPTY != cdj->word[0]->morpheme_type); /* already discarded */ if (debug_level(D_SLM)) print_with_subscript_dot(cdj->string); match_found = false; /* Proceed in all the paths in which the word is found. */ for (wpp = wp_old; NULL != wpp->word; wpp++) { const Gword **wlp; /* disjunct word list */ for (wlp = cdj->word; *wlp; wlp++) { if (*wlp == wpp->word) { match_found = true; for (next = wpp->word->next; NULL != *next; next++) { wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next); } break; } } } if (!match_found) { /* FIXME? A message can be added here if there are too many words * in the linkage (can happen only if there is an internal error). */ lgdebug(D_SLM, "- No Wordgraph match\n"); break; } lgdebug(D_SLM, "\n"); } if (match_found) { match_found = false; /* Validate that there are no missing words in the linkage. It is so if * the dummy termination word is found in the new pathpos queue. */ if (NULL != wp_new) { for (wpp = wp_new; NULL != wpp->word; wpp++) { if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) { match_found = true; /* Exit the loop with with wpp of the termination word. */ break; } } } if (!match_found) lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg); } #define DEBUG_morpheme_type 0 /* Check the morpheme type combination. * If null_count > 0, the morpheme type combination may be invalid * due to null subwords, so skip this check. */ if (match_found && (0 == sent->null_count) && (NULL != afdict) && (NULL != afdict->regex_root)) { const Gword **w; char *affix_types_p = affix_types; /* Construct the affix_types string. 
*/ #if DEBUG_morpheme_type print_lwg_path(wpp->path); #endif i = 0; for (w = wpp->path; *w; w++) { i++; if (MT_EMPTY == (*w)->morpheme_type) continue; /* really a null word */ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch-enum" switch ((*w)->morpheme_type) { #pragma GCC diagnostic pop default: /* What to do with the rest? */ case MT_WORD: *affix_types_p = AFFIXTYPE_WORD; break; case MT_PREFIX: *affix_types_p = AFFIXTYPE_PREFIX; break; case MT_STEM: *affix_types_p = AFFIXTYPE_STEM; break; case MT_MIDDLE: *affix_types_p = AFFIXTYPE_MIDDLE; break; case MT_SUFFIX: *affix_types_p = AFFIXTYPE_SUFFIX; break; } #if DEBUG_morpheme_type lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n", i, (*w)->subword, *affix_types_p); #endif affix_types_p++; } *affix_types_p = '\0'; #ifdef WORD_BOUNDARIES /* not yet implemented */ { const Gword *uw; /* If w is an "end subword", return its unsplit word, else NULL. */ uw = word_boundary(w); /* word_boundary() unimplemented */ if (NULL != uw) { *affix_types_p++ = AFFIXTYPE_END; lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword); } } #endif /* Check if affix_types is valid according to SANEMORPHISM. */ if (('\0' != affix_types[0]) && (NULL == match_regex(afdict->regex_root, affix_types))) { /* Morpheme type combination is invalid */ match_found = false; /* Notify to stdout, so it will be shown along with the result. * XXX We should have a better way to notify. */ if (0 < opts->verbosity) printf("Warning: Invalid morpheme type combination '%s', " "run with !bad and !verbosity=4 to debug\n", affix_types); } } if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */ wordgraph_path_free(wp_old, true); wordgraph_path_free(wp_new, !match_found); if (match_found) { if ('\0' != affix_types[0]) { lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types); } lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg); lkg->wg_path = lwg_path; return true; } /* Oh no ... invalid morpheme combination! 
*/ sent->num_valid_linkages --; lifo->N_violations++; lifo->pp_violation_msg = "Invalid morphism construction."; lkg->wg_path = NULL; lifo->discarded = true; lgdebug(D_SLM, "%p FAILED\n", lkg); return false; }
/**
 * Compute the words to be displayed for a linkage.
 *
 * For every linkage word a display string is selected:
 *  - Null words (NULL chosen disjunct) are accumulated into null blocks.
 *    A block is turned into one string, which is put between
 *    NULLWORD_START and NULLWORD_END unless the word is a wall.
 *  - For linked words the word string of the chosen disjunct is used.
 *    The subscript of idiom words is removed. When morphology display is
 *    off, the morphemes of an alternative are joined into one word, and
 *    the other morphemes get an empty string.
 *  - Guess marks in square brackets are added where needed, and the
 *    SUBSCRIPT_MARK is converted to SUBSCRIPT_DOT.
 *
 * The result is stored in a newly allocated linkage->word array, dropping
 * the discarded words. linkage->num_words is updated accordingly, the
 * links are remapped by remap_linkages(), and the Wordgraph path that
 * matches the displayed words is stored in linkage->wg_path_display.
 *
 * @param sent     The sentence; its string_set holds the created strings.
 * @param linkage  The linkage; chosen_disjuncts, wg_path, link_array,
 *                 num_links and num_words are read, and word, num_words
 *                 and wg_path_display are written. The link_name of
 *                 morphology links is set to NULL when morphology display
 *                 is off.
 * @param opts     Only opts->display_morphology is read here.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	/* Note: The HIDE_MORPHO macro is used below. NOTE(review): it presumably
	 * refers to this local variable by name - do not rename it. */
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of word in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* TODO: Print the subscript, as in "dog.n" as opposed to "dog". */

			/* Get rid of those ugly ".Ixx" */
			if (is_idiom_word(t))
			{
				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
				UNREACHABLE(NULL == sm); /* We know it has a subscript. */
				*sm = '\0';
				t = string_set_add(s, sent->string_set);
			}
			else if (HIDE_MORPHO)
			{
				/* Concatenate the word morphemes together into one word.
				 * Concatenate their subscripts into one subscript.
				 * Use subscript separator SUBSCRIPT_SEP.
				 * XXX Check whether we can encounter an idiom word here.
				 * FIXME Combining contracted words is not handled yet, because
				 * combining morphemes which have non-LL links to other words
				 * is not yet implemented.
				 * FIXME Move to a separate function. */
				Gword **wgaltp;
				size_t join_len = 0;
				size_t mcnt = 0;

				/* If the alternative contains morpheme subwords, mark it
				 * for joining... */

				const Gword *unsplit_word = w->unsplit_word;
				for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
				{
					if ((*wgaltp)->unsplit_word != unsplit_word) break;
					if (MT_INFRASTRUCTURE ==
					    (*wgaltp)->unsplit_word->morpheme_type) break;

					mcnt++;

					if (NULL == cdjp[j])
					{
						/* ... but not if it contains a null word */
						join_alt = false;
						break;
					}
					join_len += strlen(cdjp[j]->word_string) + 1;
					if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
						join_alt = true;
				}

				if (join_alt)
				{
					/* Join it in two steps: 1. Base words. 2. Subscripts.
					 * FIXME? Can be done in one step (more efficient but maybe
					 * less clear).
					 * Put SUBSCRIPT_SEP between the subscripts.
					 * XXX No 1-1 correspondence between the hidden base words
					 * and the subscripts after the join, in case there are base
					 * words with and without subscripts. */

					const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
					char *join = calloc(join_len + 1, 1); /* zeroed out */

					/* The buffer is written to right away, so fail loudly
					 * instead of dereferencing a NULL pointer. */
					assert(NULL != join, "compute_chosen_words(): Out of memory");

					/* 1. Join base words. (Could just use the unsplit_word.) */
					for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
					{
						add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
						                      (*wgaltp)->morpheme_type);
					}

					strcat(join, subscript_mark_str()); /* tentative */

					/* 2. Join subscripts. */
					for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
					{
						/* Cannot NULLify the word - we may have links to it. */
						if (m != mcnt-1) chosen_words[i+m] = "";

						sm = strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

						if (NULL != sm)
						{
							/* Supposing stem subscript is .=x (x optional) */
							if (MT_STEM == (*wgaltp)->morpheme_type)
							{
								sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
								if ('\0' == *sm) sm = NULL;
#if 0
								if ((cnt-1) == m)
								{
									/* Support a prefix-stem combination. In that case
									 * we have just nullified the combined word, so we
									 * need to move it to the position of the prefix.
									 * FIXME: May still not be good enough. */
									move_combined_word = i+m-1;

									/* And the later chosen_word assignment should be:
									 * chosen_words[-1 != move_combined_word ?
									 *    move_combined_word : i] = t;
									 */
								}
								else
								{
									move_combined_word = -1;
								}
#endif
							}
						}
						if (NULL != sm)
						{
							strcat(join, sm+1);
							strcat(join, subscript_sep_str);
						}
					}

					/* Remove an extra mark, if any */
					join_len = strlen(join);
					if ((SUBSCRIPT_SEP == join[join_len-1]) ||
					    (SUBSCRIPT_MARK == join[join_len-1]))
						join[join_len-1] = '\0';

					gwordlist_append(&n_lwg_path, sentence_word);
					t = string_set_add(join, sent->string_set);
					free(join);

					/* Skip the morphemes that have just been joined. */
					i += mcnt-1;
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{
				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* Length of the base word (up to the subscript, if any). */
					size_t baselen = (NULL == sm) ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication
					 * of the last subword is used as the guess indication of the
					 * whole word.
					 * FIXME? The guess indications of other subwords are ignored
					 * in this mode. This implies that if a first or middle
					 * subword has a guess indication but the last subword
					 * doesn't have, no guess indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					/* Copy the base word only. The string terminator is
					 * written by the strcpy() below. */
					memcpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					/* sm still points into the strdupa() copy made above, in
					 * which the mark has been replaced by SUBSCRIPT_DOT. */
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		for (i = 0; i < linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i = 0; i < linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i = 0, j = 0; i < linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			/* A plain store: The array was just allocated, so it has no
			 * previous content that is worth reading back. */
			linkage->word[j] = chosen_words[i];
			remap[i] = j;
			j++;
		}
		else
		{
			remap[i] = -1; /* this word is dropped */
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}