Ejemplo n.º 1
0
/**
 * Add the words of the list dn to the affix classes of afdict.
 *
 * For each node, the class is the one named by the node's single connector
 * (as returned by word_only_connector()).  The string that gets added is
 * the word truncated at its first '_' found after its first character when
 * the word contains an underbar, or the deinflect()'ed word otherwise.
 *
 * A word for which word_only_connector() returns NULL is reported and
 * skipped; the remaining words of the list are still processed.
 * Every node of the list is released with xfree(), whether or not its word
 * was added.
 *
 * @param afdict  Affix dictionary to which the words are added.
 * @param dn      Head of the word list, linked through the "left" field.
 * @param l       Not used.
 */
static void load_affix(Dictionary afdict, Dict_node *dn, int l)
{
	Dict_node *next = NULL;

	for (; NULL != dn; dn = next)
	{
		const char *con = word_only_connector(dn);

		/* Remember the successor now: the node is freed on every path. */
		next = dn->left;

		if (NULL == con)
		{
			/* ??? should we support here more than one class? */
			prt_error("Warning: Word \"%s\" found near line %d of %s.\n"
			          "\tWord has more than one connector.\n"
			          "\tThis word will be ignored.",
			          dn->string, afdict->line_number, afdict->name);
		}
		else
		{
			char *string;

			/* The affix files serve a dual purpose: they indicate both
			 * what a unit is, connector-wise, and what is strippable, as
			 * a string.  When the unit is an 'idiom' (i.e. two words,
			 * e.g. base_pair or degrees_C) then only the first word can
			 * be stripped away from a run-on expression (e.g. "86degrees C")
			 */
			if (contains_underbar(dn->string))
			{
				char *underbar;

				string = strdup(dn->string);
				/* The search starts at the second character, so an
				 * underbar in the first position is not a separator. */
				underbar = strchr(string+1, '_');
				if (NULL != underbar) *underbar = '\0';
			}
			else
			{
				string = deinflect(dn->string);
			}

			affix_list_add(afdict, afdict_find(afdict, con,
			               /*notify_err*/true), string);
			free(string);
		}

		xfree((char *)dn, sizeof(Dict_node));
	}
}
Ejemplo n.º 2
0
/**
 * Run the regex tokenizer on str, using the tokenizer pattern inpat.
 * (Was main() of the test program.)
 *
 * Steps:
 *  1. Count the capture groups of inpat and syntax-check it with
 *     pcre_compile().
 *  2. Build the pattern that finds all possible matches: each capture group
 *     "(X)" of inpat becomes "(?:(X)(?C))", i.e. it is followed by a
 *     callout.  For a group named "(?<NAME>...)", NAME selects the lookup
 *     that is recorded for the group: "DICTWORD" is the main dictionary, a
 *     name known to afdict_find() is an affix class, and any other name
 *     (with an optional leading 'r') must be a regex name known to
 *     get_regex_by_name().  An empty group body is replaced by ".*", or by
 *     the named regex.  A "$" (if missing) and the callout "(?C1)" are
 *     appended at the end.
 *  3. Compile and study that pattern, then execute it anchored on str with
 *     the per-group info passed as the callout data.  The execution itself
 *     is expected to end with "no match".  The result is printed with
 *     printov().
 *
 * All the resources allocated here are released before returning, also on
 * the error paths.
 *
 * @param inpat  Tokenizer pattern; must include at least one capture group.
 * @param flags  Not used.
 * @param str    The string to tokenize.
 * @param dict   Dictionary used for the word, affix class and regex lookups.
 * @return 0 if the tokenizer pattern has been executed, else an error code:
 *         1 (out of memory), 2 (inpat syntax), 3 (pcre_study), 9 (no group),
 *         99 (the built pattern doesn't compile), 103, 199, 250 (unsupported
 *         usage or nesting), 252, 253, 254 (unknown or empty class).
 */
static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict)
{
	const char *p;
	dyn_str *pat = NULL;
	int plevel;  /* paren level */
	int cglevel; /* capture group level */
	int nplevel;  /* paren level within named capture group */
	int icgnum;  /* capture group number*/
	int cgalloc = 0; /* number of elements allocated in callout_data.cgnum */
	int options;
	const char *errptr;
	int erroffset;
	pcre *pcre;
	const char * const prog = "regex_tokenizer_test";
	int rc;
	int status = 0; /* the value to return */
	pcre_extra *extra = NULL;
#define OVCNT 15
	int ovector[OVCNT];
	callout_data_t callout_data;

#if 0
	const char **wordlist;
#endif
	bool word_compare_flag = true;
#ifdef notdef
	dyn_str *wordalts;
#endif
	const char *group_name = NULL;
	char *word_classname = NULL;
	char *re = NULL; /* a regex from the 4.0.regex file */
	char c0[2] = "\0\0";

	/* FIXME: validate we use PCRE version 2 at least. */

	/* Find the number of capturing groups in the input pattern. */
	icgnum = 0;
	for (p = inpat; '\0' != *p; p++)
	{
		/* Count as capture groups only (string) or (?<name>). Especially, avoid
		 * counting (*...), (?<=...) (positive look behind) and
		 * (?(condition)...) (the (condition) part). This is the same test
		 * that the pattern conversion loop below uses.
		 * FIXME: support () inside [].
		 * FIXME: support \. */
		if ((*p == '(') && (p[1] != '*') &&
		    ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) &&
			 ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?')))
		{
			icgnum++;
		}
	}
	if (0 == icgnum)
	{
		printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat);
		return 9;
	}
#if 0
	if (p[-1] != '$')
	{
		/* FIXME: add $ if needed */
		printf("%s: pattern must end with $ (was: %s)\n", prog, inpat);
		return 9;
	}
#endif

	/* Regex syntax check of the pattern.
	 * FIXME: Add support for "(?J)" */
	options = PCRE_UTF8;
	pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, inpat, erroffset, errptr);
		return 2;
	}
	/* Only the syntax check was needed. The pattern which is actually
	 * executed is compiled below. */
	pcre_free(pcre);
	pcre = NULL;

	callout_data.wordlist = NULL;
	callout_data.cgnum = NULL;
	if (word_compare_flag)
	{
		int i;
#if 0
		callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*icgnum);
#endif
		callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*icgnum);
		if (NULL == callout_data.cgnum)
		{
			printf("%s: Out of memory\n", prog);
			return 1;
		}
		cgalloc = icgnum;
		//printf("ALLOCATED callout_data.cgnum %ld for %d groups\n",
		//sizeof(*callout_data.wordlist)*cgnum, icgnum);
		for (i = 0; i < icgnum; i++)
		{
#if 0
			callout_data.wordlist[i] = NULL;
#endif
			callout_data.cgnum[i] = NULL;

		}
	}

	/* Build the pattern that finds all possible matches. */
	pat = dyn_str_new();
	plevel = 0;
	cglevel = 0;
	icgnum = -1; /* First capture group (plevel==1) is icgnum==0. */

	/* Convert the input regex to the tokenizer regex.
	 * cglevel counts named capture groups
	 * plevel counts all groups
	 *
	 * FIXME: Add support for:
	 * (?x) - comment mode.
	 * (?i) - ignore case.
	 * \ - backslash for ()<>?* .
	 * [] - () inside it
	 * FIXME: Add "(?: ... )" over the result pattern.
	 */
	//dyn_strcat(pat, "(?J)");
	for (p = inpat; '\0' != *p; p++)
	{
		switch (*p)
		{
		const char *c;

		case '(':
			if (cglevel > 0)
			{
				printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", (long)(p-inpat));
			}
			plevel++;
			if ((p[1] == '*') ||
			    ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) ||
			    ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?')))
			{
				break;
			}
			cglevel++;
			if (cglevel > 1)
			{
				printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", (long)(p-inpat));
				status = 199;
				goto cleanup;
			}
			icgnum++;
			/* The '(' itself is added after the switch, so the capture
			 * group gets wrapped by this non-capturing group. */
			dyn_strcat(pat, "(?:");
			group_name = NULL;
			break;
		case ')':
			plevel--;
			if (cglevel > 0)
			{
				cglevel--;
				/* Add the dict lookup and capturing callback. */
				dyn_strcat(pat, ")(?C)");
			}
			group_name = NULL;
			break;
		case '<':
			/* Remember it as a potential start of a named group. */
			if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') && (p[1]  != '='))
			{
				group_name = p + 1;
			}
			else
				group_name = NULL;
			break;
		case '>':
			if (NULL != group_name)
			{
				/* Check if this is actually a group name */
				for (c = group_name; c < p; c++)
				{
					/* FIXME: 'a' and 'p' are part of a hack for lookup_mark.
					 * FIXME: 'r' is part of a hack for regex names that match affix
					 * class names. The fix is not to use matching names. */
					if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break;
				}
				if (c == p)
				{
					word_classname = malloc(p-group_name+1);
					if (NULL == word_classname)
					{
						printf("%s: Out of memory\n", prog);
						status = 1;
						goto cleanup;
					}
					memcpy(word_classname, group_name, p-group_name);
					word_classname[p-group_name] = '\0';
				} else
				{
					/* group_name-3 is the '(' of this "(?<". */
					printf("%s: Invalid class name in group name found at '%s'\n",
					       prog, group_name-3);
					word_classname = NULL;
				}
			} else
			{
					word_classname = NULL;
			}
			if (!word_classname)
			{
				group_name = NULL;
				break;
			}
			dyn_strcat(pat, ">");

			lgdebug(6, "Found word-class %s\n", word_classname);
#if 0
			wordlist = readwords(word_classname);
			if (NULL == wordlist)
			{
				printf("i%s: Invalid class name %s in group name\n", prog, word_classname);
				return 100;
			}

			if (!word_compare_flag)
			{
				printf("Invocation without -w is not supported\n");
				return 103;
			}
#endif

			if (word_compare_flag)
			{
				char *t;
				const char *lookup_mark = NULL;
#if 0
				callout_data.wordlist[icgnum] = wordlist;
				printf("WORDLIST %p at cgnum %d\n", wordlist, icgnum);
#endif
				/* Allocate per group info (zeroed, so no field is left
				 * uninitialized). */
				callout_data.cgnum[icgnum] = calloc(1, sizeof(*(callout_data.cgnum)[0]));
				if (NULL == callout_data.cgnum[icgnum])
				{
					free(word_classname);
					printf("%s: Out of memory\n", prog);
					status = 1;
					goto cleanup;
				}
				/* From now on the name buffer is owned by the group info,
				 * and it is freed along with it at the end. */
				callout_data.cgnum[icgnum]->name = word_classname;
				//printf("ALLOCATED cgnum[%d]=%p\n", icgnum,
				//callout_data.cgnum[icgnum]);

				/* A hack for testing: Handle WORDpX or WORDaX.
				 * The above a/p marks mean append/prepend X to word before making
				 * the lookup.
				 * FIXME: Find another way to specify that, maybe in the affix file
				 * or in a tokenizer definition file. */
				t = strpbrk(word_classname, "pa");
				if (NULL != t)
				{
					Afdict_class *ac;

					callout_data.cgnum[icgnum]->lookup_mark_pos = *t;
					*t = '\0';
					ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false);
					if (NULL == ac)
					{
						printf("%s: Unknown afclass '%s'\n", prog, t+1);
						status = 253;
						goto cleanup;
					}

					/* Check if the requested affix class is defined and is not an
					 * empty string (like the default INFIXMARK). */
					if (0 == ac->length || '\0' == ac->string[0][0])
					{
						printf("%s: No value for afclass '%s'\n", prog, t+1);
						status = 252;
						goto cleanup;
					}
					lookup_mark = ac->string[0]; /* FIXME: support more than one value. */
				}

				callout_data.cgnum[icgnum]->lookup_mark = lookup_mark;

				if (0 == strcmp(word_classname, "DICTWORD"))
				{
					/* Assign data for looking up a word in the main dict. */
					callout_data.cgnum[icgnum]->dict = dict;
					callout_data.cgnum[icgnum]->afclass = NULL;
				}
				else
				if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false))
				{
					callout_data.cgnum[icgnum]->dict = dict->affix_table;
					callout_data.cgnum[icgnum]->afclass = word_classname;
				}
				else
				{
					/* Only the local pointer is advanced; the group info
					 * still points to the start of the buffer. */
					if ('r' == word_classname[0]) word_classname++;
					re = get_regex_by_name(dict, word_classname);
					if (re)
					{
						lgdebug(6, "Regex %s with modified groups: '%s'\n", word_classname, re);
						callout_data.cgnum[icgnum]->dict = NULL;
						/* FIXME: No need to allocate callout_data.cgnum[icgnum] in this
						 * case. */
					}
					else
					{
						printf("%s: Unknown word classname '%s'\n", prog, word_classname);
						status = 254;
						goto cleanup;
					}
				}
				/* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */

			} else
			{
#if 0
				wordalts = make_wordalts(wordlist);
				dyn_strcat(pat, wordalts->str);
				dyn_str_delete(wordalts);
				free(wordlist);
#else
				free(word_classname); /* not owned by any group info */
				printf("%s: Invocation without -w is not supported\n", prog);
				status = 103;
				goto cleanup;
#endif
			}
			/* Default match for dictionary lookup is ".*".
			 * Allow replacing it by something else.
			 * E.g: .{2,}|a */
			if (')' == p[1])
			{
				if (NULL == re)
				{
					dyn_strcat(pat, ".*");
				}
				else
				{
					dyn_strcat(pat, re);
				}
			}
			else
			{
				nplevel = 1;
				/* Copy the group body up to (not including) the ')' that
				 * closes the group.
				 * FIXME: Add support for:
				 * (?x) - comment mode.
				 * \ - backslash for ()<>?* .
				 * [] - () inside it
				 */
				for (; p[1] != '\0' && nplevel > 0; p++)
				{
					switch (p[1])
					{
					case '(':
						if (('?' != p[2]) && ('*' != p[2]) &&
						    ((p[-1] != '(') || (p[0] != '?')))
						{
							printf("%s: Capture_group %d: Nested capture group is not supported\n",
							       prog, icgnum+1);
							status = 250;
							goto cleanup;
						}
						nplevel++;
						break;
					case ')':
						nplevel--;
						if (0 == nplevel) continue; /* we are done */
						break;
					}

					c0[0] = p[1];
					dyn_strcat(pat, c0);
				}
				/* Step back so the outer loop handles the closing ')'. */
				p--;
			}

			/* The named regex (if any) is not needed any more. */
			free(re);
			re = NULL;

			word_classname = NULL;
			group_name = NULL;
			continue;
		}

		c0[0] = *p;
		dyn_strcat(pat, c0);
	}

	/* Add '$' at the end if needed. */
	if ('$' != pat->str[pat->end-1]) dyn_strcat(pat, "$");
	/* Add the backtracking callback. */
	dyn_strcat(pat, "(?C1)");

	printf("Modified pattern: %s", pat->str);
	lgdebug(2, " (len %zu/%zu)", pat->end, pat->len);
	printf("\n");

	pcre_callout = callout;

	callout_data.function = 1;
	callout_data.subp_i = 0;
	callout_data.subp[0].s = 0;
	callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE;
	callout_data.subp_ovfl = false;
	callout_data.capture_last = 0;
	callout_data.pattern = pat->str;
	callout_data.alt_counter = 0;

	options = PCRE_UTF8;
	pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, pat->str, erroffset, errptr);
		status = 99;
		goto cleanup;
	}

	/* TODO: Check if using JIT may optimize out some needed callouts. */
	options = 0; //PCRE_STUDY_JIT_COMPILE;
	extra  = pcre_study(pcre, options, &errptr);
	if (NULL == extra)
	{
		if (NULL != errptr)
		{
			printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr);
			status = 3;
			goto cleanup;
		}
		/* No study data, but a pcre_extra block is still needed in order
		 * to pass the callout data. */
		extra = malloc(sizeof(*extra));
		if (NULL == extra)
		{
			printf("%s: Out of memory\n", prog);
			status = 1;
			goto cleanup;
		}
		memset(extra, 0, sizeof(*extra));
	} else
	{
		/* Report whether JIT code has been generated. In extra->flags this is
		 * PCRE_EXTRA_EXECUTABLE_JIT; the PCRE_STUDY_* values are options of
		 * pcre_study() and are not valid masks for this field. */
		lgdebug(6, "%s: pcre_study: JIT %d\n", prog,
		        (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) ? 1 : 0);
	}

#if 0
	extra->match_limit = 10000;
	extra->match_limit_recursion = 10000;
	extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION;
#endif

	extra->callout_data = (void *)&callout_data;
	extra->flags |= PCRE_EXTRA_CALLOUT_DATA;

#if 0
	printf("CGNUM %d\n", icgnum);
	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]);
		}
	} else
		printf("CGNUM %p\n", callout_data.cgnum);
#endif

	options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */
	rc = pcre_exec(pcre, extra, str, (int)strlen(str), 0, options, ovector, OVCNT);
	if (rc < 0)
	{
		if (PCRE_ERROR_NOMATCH == rc)
		{
			lgdebug(2, "No match (must always happen)\n");
		} else
		{
			printf("%s: pcre_exec: Error %d\n", prog, rc);
		}
	} else
	{
		printf("Internal error: Unexpected match, rc=%d\n", rc);
	}

	if (0 == rc)
	{
	  rc = OVCNT/3;
	  printf("ovector only has room for %d captured substrings\n", rc - 1);
	}

	printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true);

	if (verbosity > 6)
	{
		if (0 != callout_data.subp_i)
		{
			printf("Callout stack:\n");
			printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false);
		}
	}

cleanup:
	/* Free everything. Each item is still NULL if it was not yet allocated. */
	free(re);
	if (NULL != pat) dyn_str_delete(pat); /* note - callback_data uses parts of pat */
	if (NULL != extra) pcre_free_study(extra); /* safe even if malloc'ed */
	if (NULL != pcre) pcre_free(pcre);

	if (NULL != callout_data.cgnum)
	{
		int i;

		/* All the elements were initialized to NULL on allocation, so
		 * the whole array can be scanned. */
		for (i = 0; i < cgalloc; i++)
		{
			if (callout_data.cgnum[i])
			{
				/* The name is the word_classname buffer of the group. */
				free((void *)callout_data.cgnum[i]->name);
				free(callout_data.cgnum[i]);
			}
		}
		free(callout_data.cgnum);
	}

#if 0
	if (NULL != callout_data.wordlist)
	{
		int i;

		for (i = 0; i < icgnum; i++)
		{
			free(callout_data.wordlist[i]);
		}
		free(callout_data.wordlist);
	}
#endif

	return status;
}
Ejemplo n.º 3
0
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If the group has a lookup mark, it is prepended ('p') or appended ('a') to
 * the word before the lookup is made, according to lookup_mark_pos.
 *
 * @param word_start  Start of the word within the tokenized string.
 * @param numchar     Number of chars of the word.
 * @param cgnump      Capture group info: dict, afclass and lookup mark.
 * @return true if the group has no afclass and the word is found by
 *         boolean_dictionary_lookup(), or if the word is found in the affix
 *         class afclass. false otherwise, including on an internal error
 *         (E1/E2 below).
 *
 * FIXME: Return int instead of bool, see the comment at E1 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	/* Room for the word, the optional mark and the terminating null. */
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#else
	const Afdict_class *ac;
	size_t i;
#endif

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			/* This also adds the terminating null. */
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default:
			/* Unknown mark position: look up the bare word. */
			printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (NULL == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, (const void *)dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * rerun bool, but return an int so -1 signifies an error.
			 * Skip this entry, as it has no connector to compare. */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * rerun bool, but return an int so -1 signifies an error.
		 * There is no class to search, so the word cannot match. */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}