/**
 * Add the words of an affix-file entry to their affix class.
 *
 * Walks the Dict_node list starting at dn (linked through ->left). For each
 * node, the connector returned by word_only_connector() names the affix
 * class, and the (possibly shortened) word string is appended to that class
 * with affix_list_add(). Each node is released with xfree() once handled, so
 * the list must not be used by the caller afterwards.
 *
 * Processing stops at the first node for which word_only_connector()
 * returns NULL: a warning is printed and the rest of the list is released
 * without being added.
 *
 * @param afdict  The affix dictionary that is being loaded.
 * @param dn      Head of the node list to consume.
 * @param l       Unused here.
 */
static void load_affix(Dictionary afdict, Dict_node *dn, int l)
{
	Dict_node *dnx = NULL;

	(void)l;

	for (; NULL != dn; dn = dnx)
	{
		char *string;
		const char *con = word_only_connector(dn);

		if (NULL == con)
		{
			/* ??? should we support here more than one class? */
			prt_error("Warning: Word \"%s\" found near line %d of %s.\n"
			          "\tWord has more than one connector.\n"
			          "\tThis word will be ignored.",
			          dn->string, afdict->line_number, afdict->name);

			/* This function owns the nodes: release the ones that were
			 * not consumed so that bailing out does not leak them. */
			while (NULL != dn)
			{
				dnx = dn->left;
				xfree((char *)dn, sizeof(Dict_node));
				dn = dnx;
			}
			return;
		}

		/* The affix files serve a dual purpose: they indicate both
		 * what a unit is, connector-wise, and what is strippable, as
		 * a string. When the unit is an 'idiom' (i.e. two words,
		 * e.g. base_pair or degrees_C) then only the first word can
		 * be stripped away from a run-on expression (e.g. "86degrees C") */
		if (contains_underbar(dn->string))
		{
			char *p;

			/* Keep only the part that precedes the first underbar; the
			 * scan starts at the second character, so a leading
			 * underbar is kept as part of the word. */
			string = strdup(dn->string);
			p = string+1;
			while (*p != '_' && *p != '\0') p++;
			*p = '\0';
		}
		else
		{
			string = deinflect(dn->string);
		}

		affix_list_add(afdict, afdict_find(afdict, con,
		               /*notify_err*/true), string);
		free(string);

		dnx = dn->left;
		xfree((char *)dn, sizeof(Dict_node));
	}
}
/**
 * Convert a tokenizer regex to a PCRE pattern with callouts and run it on a
 * string. (Was main() of the test program...)
 *
 * Every capture group of inpat - "(...)" or "(?<NAME>...)" - is rewritten as
 * "(?:" group ")(?C)" so that a callout is invoked after the group matched,
 * and "(?C1)" is appended to the end of the pattern as the backtracking
 * callout. For a named group, NAME selects what the group is looked up in:
 *  - "DICTWORD": the main dictionary;
 *  - an affix class name: that class in dict->affix_table;
 *  - otherwise a regex name (an optional leading 'r' is skipped), whose
 *    regex is used as the group's match expression.
 * NAME may be followed by 'p' or 'a' and an affix class name, meaning that
 * the first value of that class is prepended/appended to the word before it
 * is looked up (a hack for testing).
 * If nothing follows "(?<NAME>" the match expression defaults to ".*".
 *
 * FIXME: validate we use PCRE version 2 at least.
 * FIXME: Add support for:
 *  (?x) - comment mode.
 *  (?i) - ignore case.
 *  (?J)
 *  \ - backslash for ()<>?* .
 *  [] - () inside it
 * FIXME: Add "(?: ... )" over the result pattern.
 *
 * @param inpat  The tokenizer regex.
 * @param flags  Unused.
 * @param str    The string to tokenize.
 * @param dict   The dictionary; its affix_table is used too.
 * @return 0 once the pattern has been executed (also when pcre_exec()
 *  reported an error or an unexpected match - these are only printed),
 *  else an error status:
 *    9   no capture group in the pattern
 *    2   the pattern failed the PCRE syntax check
 *    199 nested capture groups
 *    253 unknown affix class in a lookup mark
 *    252 the affix class of a lookup mark has no value
 *    254 unknown word class name
 *    250 capture group inside the match expression of a named group
 *    103 invocation without word comparison (not supported)
 *    99  the converted pattern failed to compile
 *    3   pcre_study() error
 *    98  out of memory
 */
static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict)
{
	const char * const prog = "regex_tokenizer_test";
#define OVCNT 15
	int ovector[OVCNT];
	callout_data_t callout_data;
	pcre_extra own_extra;        /* used if pcre_study() has nothing to return */
	pcre_extra *extra;           /* what is passed to pcre_exec() */
	pcre_extra *studied = NULL;  /* pcre_study() result - owned */
	pcre *code = NULL;           /* compiled pattern - owned */
	dyn_str *pat = NULL;         /* converted pattern - owned */
	char *re = NULL;             /* a regex from the 4.0.regex file - owned */
	char *word_classname = NULL;
	const char *group_name = NULL;
	bool word_compare_flag = true;
	char c0[2] = "\0\0";
	const char *errptr;
	const char *p;
	const char *c;
	int erroffset;
	int cglevel = 0; /* capture group level */
	int nplevel;     /* paren level within named capture group */
	int icgnum = 0;  /* capture group number */
	int ncg = 0;     /* number of slots in callout_data.cgnum */
	int status = 0;  /* the function result */
	int rc;
	int i;

	(void)flags;

	/* Find the number of capturing groups in the input pattern. */
	for (p = inpat; '\0' != *p; p++)
	{
		/* Count as capture groups only (string) or (?<name>). Especially, avoid
		 * counting (*...), (?<=...) (positive look behind) and
		 * (?(condition)...) (the (condition) part). This is the same
		 * test that is used for numbering the groups in the conversion
		 * loop below.
		 * FIXME: support () inside [].
		 * FIXME: support \. */
		if ((*p == '(') && (p[1] != '*') &&
		    ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) &&
		    ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?')))
		{
			icgnum++;
		}
	}
	if (0 == icgnum)
	{
		printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat);
		return 9; /* nothing is allocated yet */
	}
	ncg = icgnum;

	/* Regex syntax check of the pattern. Only the verdict is needed, so
	 * the compiled result is released at once. */
	code = pcre_compile(inpat, PCRE_UTF8, &errptr, &erroffset, NULL);
	if (NULL == code)
	{
		printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, inpat, erroffset, errptr);
		return 2; /* nothing is allocated yet */
	}
	pcre_free(code);
	code = NULL;

	/* From here on, every exit goes through the cleanup code at the end. */

	callout_data.wordlist = NULL;
	callout_data.cgnum = NULL;
	if (word_compare_flag)
	{
		/* One (initially empty) per-group info slot per capture group. */
		callout_data.cgnum = calloc(ncg, sizeof(*callout_data.cgnum));
		if (NULL == callout_data.cgnum) goto oom;
		for (i = 0; i < ncg; i++)
			callout_data.cgnum[i] = NULL;
	}

	/* Build the pattern that finds all possible matches.
	 * cglevel counts the nesting of capture groups. */
	pat = dyn_str_new();
	icgnum = -1; /* First capture group is icgnum==0. */

	for (p = inpat; '\0' != *p; p++)
	{
		switch (*p)
		{
			case '(':
				if (cglevel > 0)
				{
					printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", (long)(p-inpat));
				}
				/* Non-capturing constructs are copied as they are. */
				if ((p[1] == '*') ||
				    ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) ||
				    ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?')))
				{
					break;
				}
				cglevel++;
				if (cglevel > 1)
				{
					printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", (long)(p-inpat));
					status = 199;
					goto cleanup;
				}
				icgnum++;
				/* The '(' itself is added after the switch. */
				dyn_strcat(pat, "(?:");
				group_name = NULL;
				break;

			case ')':
				if (cglevel > 0)
				{
					cglevel--;
					/* Add the dict lookup and capturing callback. The
					 * ')' that closes the "(?:" is added after the switch. */
					dyn_strcat(pat, ")(?C)");
				}
				group_name = NULL;
				break;

			case '<':
				/* Remember it as a potential start of a named group. */
				if ((p-inpat >= 2) && (p[-2] == '(') && (p[-1] == '?') && (p[1] != '='))
				{
					group_name = p + 1;
				}
				else
					group_name = NULL;
				break;

			case '>':
				word_classname = NULL;
				/* A group name is accepted only within a counted capture
				 * group, so that icgnum is a valid slot index. */
				if ((NULL != group_name) && (cglevel > 0))
				{
					/* Check if this is actually a group name */
					for (c = group_name; c < p; c++)
					{
						/* FIXME: 'a' and 'p' are part of a hack for lookup_mark.
						 * FIXME: 'r' is part of a hack for regex names that match affix
						 * class names. The fix is not to use matching names. */
						if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break;
					}
					if (c == p)
					{
						const size_t len = (size_t)(p - group_name);

						word_classname = malloc(len+1);
						if (NULL == word_classname) goto oom;
						memcpy(word_classname, group_name, len);
						word_classname[len] = '\0';
					}
					else
					{
						/* group_name-3 is the '(' of "(?<". */
						printf("%s: Invalid class name in group name found at '%s'\n", prog, group_name-3);
					}
				}
				if (NULL == word_classname)
				{
					/* Not a group name - the '>' is copied as it is. */
					group_name = NULL;
					break;
				}
				dyn_strcat(pat, ">");

				lgdebug(6, "Found word-class %s\n", word_classname);

				if (word_compare_flag)
				{
					char *t;
					const char *re_name;
					const char *lookup_mark = NULL;

					/* Allocate per group info (releasing a previous one, if
					 * any, of the same group). All of its fields start
					 * zeroed. */
					if (NULL != callout_data.cgnum[icgnum])
					{
						free((void *)callout_data.cgnum[icgnum]->name);
						free(callout_data.cgnum[icgnum]);
					}
					callout_data.cgnum[icgnum] = calloc(1, sizeof(*callout_data.cgnum[icgnum]));
					if (NULL == callout_data.cgnum[icgnum])
					{
						free(word_classname);
						goto oom;
					}
					/* From now on the class name is owned by the group info,
					 * and is released along with it. */
					callout_data.cgnum[icgnum]->name = word_classname;

					/* A hack for testing: Handle WORDpX or WORDaX.
					 * The above a/p marks mean append/prepend X to word before making
					 * the lookup.
					 * FIXME: Find another way to specify that, maybe in the affix file
					 * or in a tokenizer definition file. */
					t = strpbrk(word_classname, "pa");
					if (NULL != t)
					{
						Afdict_class *ac;

						callout_data.cgnum[icgnum]->lookup_mark_pos = *t;
						*t = '\0'; /* the class name ends at the mark */
						ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false);
						if (NULL == ac)
						{
							printf("%s: Unknown afclass '%s'\n", prog, t+1);
							status = 253;
							goto cleanup;
						}

						/* Check if the requested affix class is defined and is not an
						 * empty string (like the default INFIXMARK). */
						if (0 == ac->length || '\0' == ac->string[0][0])
						{
							printf("%s: No value for afclass '%s'\n", prog, t+1);
							status = 252;
							goto cleanup;
						}
						lookup_mark = ac->string[0]; /* FIXME: support more than one value. */
					}
					callout_data.cgnum[icgnum]->lookup_mark = lookup_mark;

					if (0 == strcmp(word_classname, "DICTWORD"))
					{
						/* Assign data for looking up a word in the main dict. */
						callout_data.cgnum[icgnum]->dict = dict;
						callout_data.cgnum[icgnum]->afclass = NULL;
					}
					else if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false))
					{
						callout_data.cgnum[icgnum]->dict = dict->affix_table;
						callout_data.cgnum[icgnum]->afclass = word_classname;
					}
					else
					{
						re_name = word_classname;
						if ('r' == re_name[0]) re_name++;
						re = get_regex_by_name(dict, re_name);
						if (NULL == re)
						{
							printf("%s: Unknown word classname '%s'\n", prog, re_name);
							status = 254;
							goto cleanup;
						}
						lgdebug(6, "Regex %s with modified groups: '%s'\n", re_name, re);
						/* FIXME: No need to allocate callout_data.cgnum[icgnum] in this
						 * case. */
						callout_data.cgnum[icgnum]->dict = NULL;
						callout_data.cgnum[icgnum]->afclass = NULL;
					}
					/* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */
				}
				else
				{
					printf("%s: Invocation without -w is not supported\n", prog);
					free(word_classname);
					status = 103;
					goto cleanup;
				}

				/* Default match for dictionary lookup is ".*".
				 * Allow replacing it by something else.
				 * E.g: .{2,}|a */
				if (')' == p[1])
				{
					dyn_strcat(pat, (NULL == re) ? ".*" : re);
				}
				else
				{
					/* Copy the match expression up to (not including) the
					 * ')' that closes the group. */
					nplevel = 1;
					for (; p[1] != '\0' && nplevel > 0; p++)
					{
						switch (p[1])
						{
							case '(':
								if (('?' != p[2]) && ('*' != p[2]) &&
								    ((p[-1] != '(') || (p[0] != '?')))
								{
									printf("%s: Capture_group %d: Nested capture group is not supported\n", prog, icgnum+1);
									status = 250;
									goto cleanup;
								}
								nplevel++;
								break;
							case ')':
								nplevel--;
								if (0 == nplevel) continue; /* we are done */
								break;
							default:
								break;
						}
						c0[0] = p[1];
						dyn_strcat(pat, c0);
					}
					/* Step back, so the closing ')' is the next character
					 * that the main loop handles. */
					p--;
				}
				free(re);
				re = NULL;

				word_classname = NULL;
				group_name = NULL;
				continue;

			default:
				break;
		}

		/* Copy the current character. */
		c0[0] = *p;
		dyn_strcat(pat, c0);
	}

	/* Add '$' at the end if needed. */
	if ('$' != pat->str[pat->end-1])
		dyn_strcat(pat, "$");
	/* Add the backtracking callback. */
	dyn_strcat(pat, "(?C1)");
	printf("Modified pattern: %s", pat->str);
	lgdebug(2, " (len %zu/%zu)", pat->end, pat->len);
	printf("\n");

	pcre_callout = callout;

	callout_data.function = 1;
	callout_data.subp_i = 0;
	callout_data.subp[0].s = 0;
	callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE;
	callout_data.subp_ovfl = false;
	callout_data.capture_last = 0;
	callout_data.pattern = pat->str;
	callout_data.alt_counter = 0;

	code = pcre_compile(pat->str, PCRE_UTF8, &errptr, &erroffset, NULL);
	if (NULL == code)
	{
		printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, pat->str, erroffset, errptr);
		status = 99;
		goto cleanup;
	}

	/* TODO: Check if using JIT may optimize out some needed callouts
	 * (PCRE_STUDY_JIT_COMPILE is not requested for now). */
	studied = pcre_study(code, 0, &errptr);
	if (NULL == studied)
	{
		if (NULL != errptr)
		{
			printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr);
			status = 3;
			goto cleanup;
		}
		/* No study data - use our own block for passing the callout data. */
		memset(&own_extra, 0, sizeof(own_extra));
		extra = &own_extra;
	}
	else
	{
		extra = studied;
		/* For some reason JIT is sometimes done even though it was not requested.
		 * But the callouts are still invoked as expected in such cases. */
		lgdebug(6, "%s: pcre_study: JIT %ld\n", prog, (long)(extra->flags & PCRE_STUDY_JIT_COMPILE));
	}

	extra->callout_data = (void *)&callout_data;
	extra->flags |= PCRE_EXTRA_CALLOUT_DATA;

	/* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */
	rc = pcre_exec(code, extra, str, (int)strlen(str), 0, PCRE_ANCHORED, ovector, OVCNT);
	if (rc < 0)
	{
		if (PCRE_ERROR_NOMATCH == rc)
		{
			lgdebug(2, "No match (must always happen)\n");
		}
		else
		{
			printf("%s: pcre_exec: Error %d\n", prog, rc);
		}
	}
	else
	{
		printf("Internal error: Unexpected match, rc=%d\n", rc);
	}

	if (0 == rc)
	{
		rc = OVCNT/3;
		printf("ovector only has room for %d captured substrings\n", rc - 1);
	}

	printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true);

	if (verbosity > 6)
	{
		if (0 != callout_data.subp_i)
		{
			printf("Callout stack:\n");
			printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false);
		}
	}
	goto cleanup;

oom:
	printf("%s: Out of memory\n", prog);
	status = 98;

cleanup:
	/* Free everything. */
	free(re);
	if (NULL != pat) dyn_str_delete(pat); /* note - callout_data uses parts of pat */
	if (NULL != studied) pcre_free_study(studied);
	if (NULL != code) pcre_free(code);
	if (NULL != callout_data.cgnum)
	{
		for (i = 0; i < ncg; i++)
		{
			if (NULL != callout_data.cgnum[i])
			{
				/* The name is the malloc'ed class name of the group. */
				free((void *)callout_data.cgnum[i]->name);
				free(callout_data.cgnum[i]);
			}
		}
		free(callout_data.cgnum);
	}

	return status;
}
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If the capture group has a lookup mark, it is prepended (mark position
 * 'p') or appended (mark position 'a') to the word before the lookup.
 *
 * @param word_start  Start of the word within the tokenized string.
 * @param numchar     Number of characters of the word.
 * @param cgnump      Capture group info: the dictionary to look in, the
 *                    affix class (NULL for a lookup by
 *                    boolean_dictionary_lookup()), and the lookup mark.
 * @return true if the word is found, else false. false is returned on
 *  internal errors too (a diagnostic is then printed).
 *
 * FIXME: Return int instead of bool, see the comment at E1 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? (int)strlen(cgnump->lookup_mark) : 0;
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#else
	const Afdict_class *ac;
	size_t i;
#endif

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
			case 'p': /* prepend a mark */
				strcpy(word, cgnump->lookup_mark);
				strncat(word, word_start, numchar);
				word[numchar+lookup_mark_len] = '\0';
				break;
			case 'a': /* append a mark */
				strncpy(word, word_start, numchar);
				strcpy(word+numchar, cgnump->lookup_mark);
				break;
			default:
				/* Unknown mark position - look up the bare word. */
				printf("is_word:E3('%x' %s)",
				       cgnump->lookup_mark_pos, cgnump->lookup_mark);
				strncpy(word, word_start, numchar);
				word[numchar] = '\0';
		}
	}

	/* NOTE(review): a group may have no dictionary at all (regex_split()
	 * stores a NULL dict for groups that are defined by a regex);
	 * presumably such groups are never looked up - confirm in the caller.
	 * There is nothing to look in, so the word is reported as not found. */
	if (NULL == dict) return false;

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);

	if (NULL == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, (const void *)dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);

		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * return bool, but return an int so -1 signifies an error.
			 * There is no connector to compare, so skip this entry. */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * return bool, but return an int so -1 signifies an error.
		 * There is no class to search in, so the word is not found. */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}