/**
 * Append an unmarked (i.e. without INFIXMARK) morpheme to join_buff.
 * join_buff is a zeroed-out buffer which has enough room for morpheme to be
 * added + terminating NUL.
 * Note that MT_PREFIX or MT_SUFFIX can be without an INFIX_MARK, in case
 * INFIX_MARK is not defined. XXX: What about MT_MIDDLE? (not in use yet).
 *
 * Only the part of wm that precedes its last SUBSCRIPT_MARK (or all of wm,
 * if it has none) is considered. From that part:
 * - MT_PREFIX: a trailing infix mark, if present, is dropped.
 * - MT_SUFFIX: a leading infix mark, if present, is dropped.
 * - MT_MIDDLE: INFIX_MARK_L characters are dropped from each end, without
 *   checking that they are actually infix marks.
 * - Anything else is appended as is.
 * A morpheme that is too short to carry the mark(s) of its type is appended
 * as is, instead of being indexed out of bounds.
 *
 * FIXME Combining contracted words is not handled yet, because combining
 * morphemes which have non-LL links to other words is not yet implemented.
 */
static void add_morpheme_unmarked(Sentence sent, char *join_buff,
                                  const char *wm, Morpheme_type mt)
{
	const char infix_mark = INFIX_MARK(sent->dict->affix_table);
	const char *sm = strrchr(wm, SUBSCRIPT_MARK);
	const char *start = wm; /* First character to append */
	size_t len;             /* Number of characters to append */

	if (NULL == sm) sm = wm + strlen(wm);
	len = (size_t)(sm - wm);

	if (MT_PREFIX == mt)
	{
		/* The length check keeps sm[-INFIX_MARK_L] inside wm. */
		if ((len >= INFIX_MARK_L) && (infix_mark == sm[-INFIX_MARK_L]))
			len -= INFIX_MARK_L;
	}
	else if (MT_SUFFIX == mt)
	{
		/* The length check prevents the count from wrapping around. */
		if ((len >= INFIX_MARK_L) && (infix_mark == wm[0]))
		{
			start += INFIX_MARK_L;
			len -= INFIX_MARK_L;
		}
	}
	else if (MT_MIDDLE == mt)
	{
		/* Both ends are stripped only if there is room for both marks. */
		if (len >= 2*INFIX_MARK_L)
		{
			start += INFIX_MARK_L;
			len -= 2*INFIX_MARK_L;
		}
	}

	strncat(join_buff, start, len);
}
/**
 * Split randomly.
 *
 * Selected splits of word are issued through add_alternative(): the first
 * part as a single "stem" (with the stem subscript appended when one is
 * defined and the word is actually split), the remaining parts as suffixes.
 *
 * @param sent The sentence; its dict->affix_table supplies the anysplit
 *             parameters, the infix mark and the stem subscript.
 * @param word The word to split (NUL-terminated).
 *
 * Return true on success. This includes the case in which word is already a
 * morpheme (it starts with the infix mark or ends with the stem subscript);
 * no alternative is issued then.
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	size_t stemsubscr_len;
	size_t l = strlen(word);
	p_list pl;
	size_t pos;
	int p;
	int sample_point = -1; /* Incremented before use when not sampling */
	size_t nsplits;
	size_t rndtried = 0;
	size_t rndissued = 0;
	size_t i;
	unsigned int seed = 0; /* Fixed; time-based seeding is disabled */
	char *prefix_string;
	char *suffix_string;
	bool use_sampling = true;
	char infix_mark;

	if (NULL == afdict) return false;
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	/* Looked up only after afdict is known to be non-NULL. */
	infix_mark = INFIX_MARK(afdict);

	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	stemsubscr_len = (NULL == stemsubscr->string[0]) ? 0 :
		strlen(stemsubscr->string[0]);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF. */
	if (word[0] == infix_mark) return true;
	/* The NULL check is needed since an undefined subscript must not be
	 * handed to strcmp(). */
	if ((NULL != stemsubscr->string[0]) && (l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, stemsubscr->string[0])))
		return true;

	/* The stem buffer is sized by the actual subscript length, since the
	 * subscript is appended to it below. */
	prefix_string = alloca(l+stemsubscr_len+1); /* word + subscript + NUL */
	suffix_string = alloca(l+1);                /* word + NUL */

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything: walk the splits in order, starting at 0. */
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	/* Select splits. When sampling, stop as soon as altsmin splits have
	 * been accepted by morpheme_match(); in any case stop once every split
	 * has been tried. */
	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;

		/* Each split occupies as->nparts consecutive entries of sp[]. */
		if (morpheme_match(sent, word, l,
		                   &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n",
	        word, l, rndissued, nsplits);

	/* Issue an alternative for every selected split. */
	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			/* pl[p] is the end position of part p within word. */
			const size_t mlen = (size_t)pl[p] - pos;
			const bool whole_word = (pl[0] == (int)l);

			if (whole_word || (0 == pos))
			{
				/* The whole word, or the first of several morphemes. */
				strncpy(prefix_string, &word[pos], mlen);
				prefix_string[mlen] = '\0';

				/* Only a real stem (not the whole word) gets the subscript. */
				if (!whole_word &&
				    (0 != stemsubscr->length) && (0 != stemsubscr_len))
				{
					strcat(prefix_string, stemsubscr->string[0]);
				}
			}
			else /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], mlen);
				suffix_string[mlen] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent,
		   0,NULL, 1,(const char **)&prefix_string, num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}