/**
 * Merge all the definitions of the given affix class into one string.
 *
 * This allows an affix file to spread the characters of a class over
 * several definitions instead of one long string, e.g. instead of:
 *   ""«»《》【】『』`„": QUOTES+;
 * one can specify (note the added spaces):
 *   """ «» 《》 【】 『』 ` „: QUOTES+;
 * or even:
 *   """: QUOTES+;
 *   «» : QUOTES+;
 * etc.
 *
 * A class that has no definitions, or only one, is left untouched.
 * Otherwise the concatenation of all the entries is added to the string
 * set of the affix dictionary and stored in the first entry of the class.
 * The class length and its other entries are not modified.
 *
 * @param afdict  The affix dictionary that holds the class.
 * @param classno The given affix class.
 */
static void concat_class(Dictionary afdict, int classno)
{
	Afdict_class *cls = AFCLASS(afdict, classno);
	dyn_str *joined;
	size_t n;

	/* Zero or one definition: there is nothing to concatenate. */
	if (cls->length <= 1) return;

	joined = dyn_str_new();
	n = 0;
	while (n < cls->length)
	{
		dyn_strcat(joined, cls->string[n]);
		n++;
	}

	/* The result replaces the first entry only. */
	cls->string[0] = string_set_add(joined->str, afdict->string_set);
	dyn_str_delete(joined);
}
/**
 * Convert a list of utf8 chars to wide-chars. The reason for doing
 * this is kind-of dorky: it's so that we can easily find,
 * character-by-character, if a given character is a quotation mark
 * or a bullet. This works only because the quotation marks and
 * bullets are exactly one (wide) character in length. I would like
 * it better if we didn't do this wide-char conversion, since wide-chars
 * are badly-behaved in crazy locales, and on MS Windows.
 *
 * All the entries of the class are concatenated, converted with
 * mbsrtowcs(), and the resulting NUL-terminated wide string is stored
 * (cast) in the "string" member of the class; "mem_elems" then holds
 * the size of that buffer in bytes. The class length is not modified.
 *
 * @param afdict  The affix dictionary that holds the class.
 * @param classno The affix class to convert.
 * @return true on success (an empty class is a success and is left
 *         untouched); false if the class contains an invalid multibyte
 *         sequence or if memory cannot be allocated. On failure the
 *         class entry is left unmodified.
 */
static bool afdict_to_wide(Dictionary afdict, int classno)
{
	Afdict_class * ac;
	wchar_t * wqs;
	mbstate_t mbs;
	size_t i;
	size_t w;
	size_t nbytes;
	dyn_str * qs;
	const char *pqs;

	ac = AFCLASS(afdict, classno);
	if (0 == ac->length) return true;

	qs = dyn_str_new();
	for (i = 0; i < ac->length; i++)
		dyn_strcat(qs, ac->string[i]);

	/* First pass: only count the wide chars needed (dst is NULL).
	 * mbsrtowcs() returns (size_t)-1 on an invalid sequence, so the
	 * result is kept in a size_t and compared to that value. */
	pqs = qs->str;
	memset(&mbs, 0, sizeof(mbs));
	w = mbsrtowcs(NULL, &pqs, 0, &mbs);
	if ((size_t)-1 == w)
	{
		prt_error("Error: Affix dictionary: %s: "
		          "Invalid utf8 character\n", afdict_classname[classno]);
		dyn_str_delete(qs); /* Don't leak the work buffer on error. */
		return false;
	}

	/* Allocate before touching the class entry, so a failure leaves
	 * it intact. */
	nbytes = sizeof(*wqs) * (w+1);
	wqs = malloc(nbytes);
	if (NULL == wqs)
	{
		prt_error("Error: Affix dictionary: %s: "
		          "Out of memory\n", afdict_classname[classno]);
		dyn_str_delete(qs);
		return false;
	}

	/* Second pass: the actual conversion, from a fresh conversion state. */
	pqs = qs->str;
	memset(&mbs, 0, sizeof(mbs));
	(void)mbsrtowcs(wqs, &pqs, w, &mbs);
	wqs[w] = L'\0';

	/* Store the wide char version at the AFCLASS entry.
	 * NOTE(review): the previous ac->string array is overwritten here
	 * without being released - confirm who owns it. */
	ac->mem_elems = nbytes; /* bytes here, but we don't care */
	ac->string = (void *)wqs;

	dyn_str_delete(qs);
	return true;
}
/**
 * Complete the setup of the affix table of a dictionary.
 *
 * The following is done, in this order:
 * - The word list of every affix class is reversed in place.
 * - INFIXMARK is validated (at most one entry, one byte long); an
 *   invalid value is reported and discarded.
 * - With a valid INFIXMARK, and if both the PRE and SUF classes are
 *   empty, get_dict_affixes() is called on the dictionary root.
 *   Without an INFIXMARK, an empty-string entry is added to it.
 * - The SANEMORPHISM regex, if defined, is wrapped, stored in the
 *   regex_root element of the affix dictionary and compiled.
 * - The UNITS list is sorted (and, if AFDICT_ORDER_NOT_PRESERVED is
 *   defined, also the MPRE list).
 * - The QUOTES and BULLETS definitions are concatenated.
 * - anysplit_init() is invoked.
 *
 * @param dict The dictionary whose affix_table is to be initialized.
 * @return false if the SANEMORPHISM regex fails to compile or if
 *         anysplit_init() fails; true otherwise.
 */
static bool afdict_init(Dictionary dict)
{
	Dictionary afdict = dict->affix_table;
	Afdict_class * ac;
	Afdict_class * class_end;

	/* FIXME: read_entry() builds word lists in reverse order (can we
	 * just create the list top-down without breaking anything?). Unless
	 * it is fixed to preserve the order, reverse here the word list for
	 * each affix class. */
	class_end = &afdict->afdict_class[ARRAY_SIZE(afdict_classname)];
	for (ac = afdict->afdict_class; ac < class_end; ac++)
	{
		int lo = 0;
		int hi = ac->length - 1;

		while (lo < hi)
		{
			const char * tmp = ac->string[lo];

			ac->string[lo] = ac->string[hi];
			ac->string[hi] = tmp;
			lo++;
			hi--;
		}
	}

	/* Create the affix lists */
	ac = AFCLASS(afdict, AFDICT_INFIXMARK);
	if ((ac->length > 1) ||
	    ((1 == ac->length) && (strlen(ac->string[0]) != 1)))
	{
		prt_error("Error: afdict_init: Invalid value for class %s in file %s"
		          " (should have been one ASCII punctuation - ignored)\n",
		          afdict_classname[AFDICT_INFIXMARK], afdict->name);

		/* Discard the invalid definition. */
		free((void *)ac->string);
		ac->string = NULL;
		ac->mem_elems = 0;
		ac->length = 0;
	}

	if (1 != ac->length)
	{
		/* No INFIX_MARK - create a dummy one that always mismatches */
		affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], "");
	}
	else if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) &&
	         (0 == AFCLASS(afdict, AFDICT_SUF)->length))
	{
		/* XXX For now there is a possibility to use predefined SUF and
		 * PRE lists. So if SUF or PRE are defined, don't extract any of
		 * them from the dict. */
		char last_entry[MAX_WORD+1] = "";

		get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry);
	}

	if (debug_level(+D_AI))
	{
		size_t n;

		class_end = &afdict->afdict_class[ARRAY_SIZE(afdict_classname)];
		for (ac = afdict->afdict_class; ac < class_end; ac++)
		{
			if (0 != ac->length)
			{
				lgdebug(+0, "Class %s, %zd items:",
				        afdict_classname[ac-afdict->afdict_class], ac->length);
				for (n = 0; n < ac->length; n++)
					lgdebug(0, " '%s'", ac->string[n]);
				lgdebug(0, "\n");
			}
		}
	}
#undef D_AI

	/* Store the SANEMORPHISM regex in the unused (up to now)
	 * regex_root element of the affix dictionary, and precompile it */
	assert(NULL == afdict->regex_root, "SM regex is already assigned");
	ac = AFCLASS(afdict, AFDICT_SANEMORPHISM);
	if (0 != ac->length)
	{
		/* The regex used to be converted to: ^((original-regex)b)+$
		 * In the initial wordgraph version word boundaries are not
		 * supported, so instead it is converted to: ^(original-regex)+$ */
#ifdef WORD_BOUNDARIES
		const char *re_head = "^((";
		const char *re_tail = ")b)+$";
#else
		const char *re_head = "^(";
		const char *re_tail = ")+$";
#endif
		Regex_node *sm_re = malloc(sizeof(*sm_re));
		dyn_str *rebuf = dyn_str_new();
		int rc;

		dyn_strcat(rebuf, re_head);
		dyn_strcat(rebuf, ac->string[0]);
		dyn_strcat(rebuf, re_tail);

		sm_re->pattern = strdup(rebuf->str);
		dyn_str_delete(rebuf);

		sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]);
		sm_re->re = NULL;
		sm_re->next = NULL;
		sm_re->neg = false;
		afdict->regex_root = sm_re;

		rc = compile_regexs(afdict->regex_root, afdict);
		if (rc)
		{
			prt_error("Error: afdict_init: Failed to compile "
			          "regex '%s' in file %s, return code %d\n",
			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc);
			return false;
		}
		lgdebug(+5, "%s regex %s\n",
		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
	}

	/* sort the UNITS list */
	/* Longer unit names must get split off before shorter ones.
	 * This prevents single-letter splits from screwing things
	 * up. e.g. split 7gram before 7am before 7m */
	ac = AFCLASS(afdict, AFDICT_UNITS);
	if (ac->length > 0)
	{
		qsort(ac->string, ac->length, sizeof(char *), cmplen);
	}

#ifdef AFDICT_ORDER_NOT_PRESERVED
	/* pre-sort the MPRE list */
	ac = AFCLASS(afdict, AFDICT_MPRE);
	if (ac->length > 0)
	{
		/* Longer subwords have priority over shorter ones,
		 * reverse-sort by length.
		 * XXX mprefix_split() for Hebrew depends on that. */
		qsort(ac->string, ac->length, sizeof(char *), revcmplen);
	}
#endif /* AFDICT_ORDER_NOT_PRESERVED */

	concat_class(afdict, AFDICT_QUOTES);
	concat_class(afdict, AFDICT_BULLETS);

	/* The result of the anysplit setup is the result of the whole init. */
	return anysplit_init(afdict);
}
/**
 * Split randomly.
 *
 * The possible splits of a word of this length into up to as->nparts
 * parts are obtained from split(), using the per-word-length entry
 * as->scl[l]. If there are more splits than as->altsmax, split points
 * are sampled with rng_uniform() until as->altsmin of them are accepted
 * by morpheme_match() or all have been tried; otherwise every split is
 * tried in order. Each accepted split is issued through
 * add_alternative(), its first part carrying the STEMSUBSCR string (if
 * defined) unless the split is the whole word.
 *
 * A word that starts with the infix mark, or that ends with the
 * STEMSUBSCR string, is considered an already-split morpheme; true is
 * then returned without issuing anything.
 *
 * @param sent The sentence; its dictionary's affix table is used.
 * @param word The word to split.
 * Return true on success.
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - the word is empty or too long for the per-length split table.
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	const char *stemsubscr_str;
	size_t stemsubscr_len;

	size_t l = strlen(word);
	p_list pl;
	size_t pos;
	int p;
	int sample_point = -1;
	size_t nsplits;
	size_t rndtried = 0;
	size_t rndissued = 0;
	size_t i;
	unsigned int seed = 0;
	char *prefix_string;
	char *suffix_string;
	bool use_sampling = true;
	char infix_mark;

	if (NULL == afdict) return false;
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	/* Only now is it safe to look into the affix table. */
	infix_mark = INFIX_MARK(afdict);

	/* STEMSUBSCR may be undefined; only look at its first entry when
	 * the class is not empty. */
	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	stemsubscr_str = (0 == stemsubscr->length) ? NULL : stemsubscr->string[0];
	stemsubscr_len = (NULL == stemsubscr_str) ? 0 : strlen(stemsubscr_str);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF. */
	if (word[0] == infix_mark) return true;
	if ((NULL != stemsubscr_str) && (l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, stemsubscr_str)))
		return true;

	/* as->scl[] is indexed by the word length; refuse words that do
	 * not fit in it instead of accessing it out of bounds. */
	if (l >= NUMELEMS(as->scl))
	{
		prt_error("Warning: anysplit(): word length %zu is too long "
		          "to split\n", l);
		return false;
	}

	/* The first part gets the whole STEMSUBSCR string appended, so the
	 * buffer must have room for it (it is not necessarily 2 bytes). */
	prefix_string = alloca(l+stemsubscr_len+1); /* word + subscript + NUL */
	suffix_string = alloca(l+1);                /* word + NUL */

	/* The seed is left at its fixed initial value. A time-based seed
	 * was considered:
	 * seed = time(NULL)+(unsigned int)(long)&seed; */

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything */
		sample_point = -1;
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;
		if (morpheme_match(sent, word, l, &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n", word, l, rndissued, nsplits);

	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			if (pl[0] == (int)l)  /* This is the whole word */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
			}
			else
			if (0 == pos)   /* The first but not the only morpheme */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';

				if (NULL != stemsubscr_str)
					strcat(prefix_string, stemsubscr_str);
			}
			else           /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], pl[p]-pos);
				suffix_string[pl[p]-pos] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent,
		   0,NULL, 1,(const char **)&prefix_string, num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}
/**
 * Initialize the anysplit parameter and cache structure.
 *
 * The parameters are taken from these affix classes:
 * - REGPARTS: one value, the maximum number of parts. If the class is
 *   not defined, or if the value is 0, anysplit is disabled.
 * - REGALTS: two positive values, stored as altsmin and altsmax.
 * - REGPRE, REGMID, REGSUF: regexes built with regbuild() and compiled.
 *
 * The allocated structure is stored in afdict->anysplit. Its nparts,
 * altsmin and altsmax members are set only after all the values have
 * been validated; until then (and on any failure) nparts is 0, which
 * anysplit() treats as "disabled".
 *
 * @param afdict The affix dictionary.
 * @return true if anysplit is disabled or was initialized successfully;
 *         false on an invalid definition, a regex compilation failure
 *         or a memory allocation failure.
 */
bool anysplit_init(Dictionary afdict)
{
	anysplit_params *as;
	size_t i;
	int nparts;
	int altsmin;
	int altsmax;

	Afdict_class *regpre = AFCLASS(afdict, AFDICT_REGPRE);
	Afdict_class *regmid = AFCLASS(afdict, AFDICT_REGMID);
	Afdict_class *regsuf = AFCLASS(afdict, AFDICT_REGSUF);

	Afdict_class *regalts = AFCLASS(afdict, AFDICT_REGALTS);
	Afdict_class *regparts = AFCLASS(afdict, AFDICT_REGPARTS);

	if (0 == regparts->length)
	{
		/* FIXME: Early assignment of verbosity by -v=x argument. */
		if (verbosity > 1)
			prt_error("Warning: File %s: Anysplit disabled (%s not defined)",
			          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return true;
	}
	if (1 != regparts->length)
	{
		prt_error("Error: File %s: Must have %s defined with one value",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return false;
	}

	as = malloc(sizeof(*as));
	if (NULL == as)
	{
		prt_error("Error: File %s: anysplit_init: Out of memory\n",
		          afdict->name);
		return false;
	}
	for (i = 0; i < NUMELEMS(as->scl); i++) as->scl[i].sp = NULL;

	/* Keep the structure in a well-defined "disabled" state until
	 * everything has been validated. */
	as->nparts = 0;
	as->altsmin = 0;
	as->altsmax = 0;
	afdict->anysplit = as;

	as->regpre = regbuild(regpre->string, regpre->length, AFDICT_REGPRE);
	as->regmid = regbuild(regmid->string, regmid->length, AFDICT_REGMID);
	as->regsuf = regbuild(regsuf->string, regsuf->length, AFDICT_REGSUF);

	if (compile_regexs(as->regpre, NULL) != 0) return false;
	if (compile_regexs(as->regmid, NULL) != 0) return false;
	if (compile_regexs(as->regsuf, NULL) != 0) return false;

	nparts = atoi(regparts->string[0]);
	if (nparts < 0)
	{
		prt_error("Error: File %s: Value of %s must be a non-negative number",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return false;
	}
	if (0 == nparts)
	{
		prt_error("Warning: File %s: Anysplit disabled (0: %s)\n",
		          afdict->name, afdict_classname[AFDICT_REGPARTS]);
		return true;
	}

	if (2 != regalts->length)
	{
		prt_error("Error: File %s: Must have %s defined with 2 values",
		          afdict->name, afdict_classname[AFDICT_REGALTS]);
		return false;
	}

	/* Validate as signed values before storing them. */
	altsmin = atoi(regalts->string[0]);
	altsmax = atoi(regalts->string[1]);
	if ((altsmin <= 0) || (altsmax <= 0))
	{
		prt_error("Error: File %s: Value of %s must be 2 positive numbers",
		          afdict->name, afdict_classname[AFDICT_REGALTS]);
		return false;
	}

	/* Everything is valid - enable anysplit. */
	as->nparts = nparts;
	as->altsmin = altsmin;
	as->altsmax = altsmax;

	return true;
}