Beispiel #1
0
/**
 * Create a short form of flags summary for displaying in a word node.
 */
const char *gword_status(Sentence sent, const Gword *w)
{
	dyn_str *s = dyn_str_new();
	const char *r;
	size_t len;

	if (w->status & WS_UNKNOWN)
		dyn_strcat(s, "UNK|");
	if (w->status & WS_INDICT)
		dyn_strcat(s, "IN|");
	if (w->status & WS_REGEX)
		dyn_strcat(s, "RE|");
	if (w->status & WS_SPELL)
		dyn_strcat(s, "SP|");
	if (w->status & WS_RUNON)
		dyn_strcat(s, "RU|");
	if (w->status & WS_HASALT)
		dyn_strcat(s, "HA|");
	if (w->status & WS_UNSPLIT)
		dyn_strcat(s, "UNS|");
	if (w->status & WS_PL)
		dyn_strcat(s, "PL|");

	len = strlen(s->str);
	if (len > 0) s->str[len-1] = '\0';
	r = string_set_add(s->str, sent->string_set);
	dyn_str_delete(s);
	return r;
}
Beispiel #2
0
/**
 * Concatenate the definitions for the given affix class.
 * This allows specifying the characters in different definitions
 * instead in a one long string, e.g. instead of:
 * ""«»《》【】『』`„": QUOTES+;
 * One can specify (note the added spaces):
 * """  «»  《》 【】 『』  ` „: QUOTES+;
 * Or even:
 * """: QUOTES+;
 * «» : QUOTES+;
 * etc.
 * Note that if there are no definitions or only one definition, there is
 * nothing to do.
 * The result is written to the first entry.
 * @param classno The given affix class.
 */
static void concat_class(Dictionary afdict, int classno)
{
	Afdict_class * ac;
	size_t i;
	dyn_str * qs;

	ac = AFCLASS(afdict, classno);
	if (1 >= ac->length) return;

	qs = dyn_str_new();
	for (i = 0; i < ac->length; i++)
		dyn_strcat(qs, ac->string[i]);

	ac->string[0] = string_set_add(qs->str, afdict->string_set);
	dyn_str_delete(qs);
}
Beispiel #3
0
static void
db_lookup_common(Dictionary dict, const char *s,
                 int (*cb)(void *, int, char **, char **),
                 cbdata* bs)
{
	sqlite3 *db = dict->db_handle;
	dyn_str *qry;

	/* The token to look up is called the 'morpheme'. */
	qry = dyn_str_new();
	dyn_strcat(qry, "SELECT subscript, classname FROM Morphemes WHERE morpheme = \'");
	dyn_strcat(qry, s);
	dyn_strcat(qry, "\';");

	sqlite3_exec(db, qry->str, cb, bs, NULL);
	dyn_str_delete(qry);
}
Beispiel #4
0
/**
 * Convert a list of utf8 chars to wide-chars.  The reason for doing
 * this is kind-of dorky: its so that we can easily find,
 * character-by-character, if a given character is a quotation mark
 * or a bullet.  This works only because the quotation marks and
 * bullets are exactly one (wide) character in length. I would like
 * it better if we didn't do this wide-char conversion, since wide-chars
 * are badly-behaved in crazy locales, and on MS Windows.
 */
static bool afdict_to_wide(Dictionary afdict, int classno)
{
	Afdict_class * ac;
	wchar_t * wqs;
	mbstate_t mbs;
	size_t i;
	int w;
	dyn_str * qs;
	const char *pqs;

	ac = AFCLASS(afdict, classno);
	if (0 == ac->length) return true;

	qs = dyn_str_new();
	for (i = 0; i < ac->length; i++)
		dyn_strcat(qs, ac->string[i]);

	/*
	 * Convert utf8 to wide chars before use.
	 * In case of error the result is undefined.
	 */
	pqs = qs->str;
	memset(&mbs, 0, sizeof(mbs));
	w = mbsrtowcs(NULL, &pqs, 0, &mbs);
	if (0 > w)
	{
		prt_error("Error: Affix dictionary: %s: "
		          "Invalid utf8 character\n", afdict_classname[classno]);
		return false;
	}

	/* Store the wide char version at the AFCLASS entry. */
	ac->mem_elems =  sizeof(*wqs) * (w+1); /* bytes here, but we don't care */
	ac->string = malloc(ac->mem_elems);
	wqs = (wchar_t *)ac->string;
	pqs = qs->str;
	(void)mbsrtowcs(wqs, &pqs, w, &mbs);
	wqs[w] = L'\0';

	dyn_str_delete(qs);

	return true;
}
Beispiel #5
0
static void
db_lookup_exp(Dictionary dict, const char *s, cbdata* bs)
{
	sqlite3 *db = dict->db_handle;
	dyn_str *qry;

	/* The token to look up is called the 'morpheme'. */
	qry = dyn_str_new();
	dyn_strcat(qry, "SELECT disjunct, cost FROM Disjuncts WHERE classname = \'");
	dyn_strcat(qry, s);
	dyn_strcat(qry, "\';");

	sqlite3_exec(db, qry->str, exp_cb, bs, NULL);
	dyn_str_delete(qry);

	if (4 < verbosity)
	{
		printf("Found expression for class %s: ", s);
		print_expression(bs->exp);
	}
}
Beispiel #6
0
/**
 * Graph node name: Add "Sentence:" for the main node; Convert SUBSCRIPT_MARK.
 * Also escape " and \ with a \.
 */
static const char *wlabel(Sentence sent, const Gword *w)
{
	const char *s;
	const char sentence_label[] = "Sentence:\\n";
	dyn_str *l = dyn_str_new();
	char c0[] = "\0\0";

	assert((NULL != w) && (NULL != w->subword), "Word must exist");
	if ('\0' == *w->subword)
		 return string_set_add("(nothing)", sent->string_set);

	if (w == sent->wordgraph) dyn_strcat(l, sentence_label);

	for (s = w->subword; *s; s++)
	{
		switch (*s)
		{
			case SUBSCRIPT_MARK:
				dyn_strcat(l, ".");
				break;
			case '\"':
				dyn_strcat(l, "\\\"");
				break;
			case '\\':
				dyn_strcat(l, "\\");
				break;
			default:
				*c0 = *s;
				dyn_strcat(l, c0);
		}
	}

	s = string_set_add(l->str, sent->string_set);
	dyn_str_delete(l);
	return s;
}
Beispiel #7
0
/**
 *  Print the chosen_disjuncts words.
 *  This is used for debug, e.g. for tracking them in the Wordgraph display.
 */
static void print_chosen_disjuncts_words(const Linkage lkg, bool prt_optword)
{
	size_t i;
	dyn_str *djwbuf = dyn_str_new();

	err_msg(lg_Debug, "Linkage %p (%zu words): ", lkg, lkg->num_words);
	for (i = 0; i < lkg->num_words; i++)
	{
		Disjunct *cdj = lkg->chosen_disjuncts[i];
		const char *djw; /* disjunct word - the chosen word */

		if (NULL == cdj)
			djw = (prt_optword && lkg->sent->word[i].optional) ? "{}" : "[]";
		else if ('\0' == cdj->word_string[0])
			djw = "\\0"; /* null string - something is wrong */
		else
			djw = cdj->word_string;

		dyn_strcat(djwbuf, djw);
		dyn_strcat(djwbuf, " ");
	}
	err_msg(lg_Debug, "%s\n", djwbuf->str);
	dyn_str_delete(djwbuf);
}
Beispiel #8
0
static bool afdict_init(Dictionary dict)
{
	Afdict_class * ac;
	Dictionary afdict = dict->affix_table;

	/* FIXME: read_entry() builds word lists in reverse order (can we
	 * just create the list top-down without breaking anything?). Unless
	 * it is fixed to preserve the order, reverse here the word list for
	 * each affix class. */
	for (ac = afdict->afdict_class;
		  ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
	{
		int i;
		int l = ac->length - 1;
		const char * t;

		for (i = 0;  i < l; i++, l--)
		{
			t = ac->string[i];
			ac->string[i] = ac->string[l];
			ac->string[l] = t;
		}
	}

	/* Create the affix lists */
	ac = AFCLASS(afdict, AFDICT_INFIXMARK);
	if ((1 < ac->length) || ((1 == ac->length) && (1 != strlen(ac->string[0]))))
	{
		prt_error("Error: afdict_init: Invalid value for class %s in file %s"
		          " (should have been one ASCII punctuation - ignored)\n",
		          afdict_classname[AFDICT_INFIXMARK], afdict->name);
		free((void *)ac->string);
		ac->length = 0;
		ac->mem_elems = 0;
		ac->string = NULL;
	}
	/* XXX For now there is a possibility to use predefined SUF and PRE lists.
	 * So if SUF or PRE are defined, don't extract any of them from the dict. */
	if (1 == ac->length)
	{
		if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) &&
		    (0 == AFCLASS(afdict, AFDICT_SUF)->length))
		{
			char last_entry[MAX_WORD+1] = "";
			get_dict_affixes(dict, dict->root, ac->string[0][0], last_entry);
		}
	}
	else
	{
		/* No INFIX_MARK - create a dummy one that always mismatches */
		affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], "");
	}

	if (debug_level(+D_AI))
	{
		size_t l;

		for (ac = afdict->afdict_class;
		     ac < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; ac++)
		{
				if (0 == ac->length) continue;
				lgdebug(+0, "Class %s, %zd items:",
				        afdict_classname[ac-afdict->afdict_class], ac->length);
				for (l = 0; l < ac->length; l++)
					lgdebug(0, " '%s'", ac->string[l]);
				lgdebug(0, "\n");
		}
	}
#undef D_AI

	/* Store the SANEMORPHISM regex in the unused (up to now)
	 * regex_root element of the affix dictionary, and precompile it */
	assert(NULL == afdict->regex_root, "SM regex is already assigned");
	ac = AFCLASS(afdict, AFDICT_SANEMORPHISM);
	if (0 != ac->length)
	{
		int rc;

		Regex_node *sm_re = malloc(sizeof(*sm_re));
		dyn_str *rebuf = dyn_str_new();

		/* The regex used to be converted to: ^((original-regex)b)+$
		 * In the initial wordgraph version word boundaries are not supported,
		 * so instead it is converted to: ^(original-regex)+$ */
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, "^((");
#else
		dyn_strcat(rebuf, "^(");
#endif
		dyn_strcat(rebuf, ac->string[0]);
#ifdef WORD_BOUNDARIES
		dyn_strcat(rebuf, ")b)+$");
#else
		dyn_strcat(rebuf, ")+$");
#endif
		sm_re->pattern = strdup(rebuf->str);
		dyn_str_delete(rebuf);

		afdict->regex_root = sm_re;
		sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]);
		sm_re->re = NULL;
		sm_re->next = NULL;
		sm_re->neg = false;
		rc = compile_regexs(afdict->regex_root, afdict);
		if (rc) {
			prt_error("Error: afdict_init: Failed to compile "
			          "regex '%s' in file %s, return code %d\n",
			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc);
			return false;
		}
		lgdebug(+5, "%s regex %s\n",
		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
	}

	/* sort the UNITS list */
	/* Longer unit names must get split off before shorter ones.
	 * This prevents single-letter splits from screwing things
	 * up. e.g. split 7gram before 7am before 7m
	 */
	ac = AFCLASS(afdict, AFDICT_UNITS);
	if (0 < ac->length)
	{
		qsort(ac->string, ac->length, sizeof(char *), cmplen);
	}

#ifdef AFDICT_ORDER_NOT_PRESERVED
	/* pre-sort the MPRE list */
	ac = AFCLASS(afdict, AFDICT_MPRE);
	if (0 < ac->length)
	{
		/* Longer subwords have priority over shorter ones,
		 * reverse-sort by length.
		 * XXX mprefix_split() for Hebrew depends on that. */
		qsort(ac->string, ac->length, sizeof(char *), revcmplen);
	}
#endif /* AFDICT_ORDER_NOT_PRESERVED */

	concat_class(afdict, AFDICT_QUOTES);
	concat_class(afdict, AFDICT_BULLETS);

	if (! anysplit_init(afdict)) return false;

	return true;
}
Beispiel #9
0
/* Was main() of the test program... */
static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict)
{
	const char *p;
	dyn_str *pat;
	int plevel;  /* paren level */
	int cglevel; /* capture group level */
	int nplevel;  /* paren level within named capture group */
	int icgnum;  /* capture group number*/
	int options;
	const char *errptr;
	int erroffset;
	pcre *pcre;
	const char * const prog = "regex_tokenizer_test";
	int rc;
	pcre_extra *extra = NULL;
#define OVCNT 15
	int ovector[OVCNT];
	callout_data_t callout_data;

#if 0
	const char **wordlist;
#endif
	bool word_compare_flag = true;
#ifdef notdef
	dyn_str *wordalts;
#endif
	const char *group_name = NULL;
	char *word_classname;
	char c0[2] = "\0\0";

	/* FIXME: validate we use PCRE version 2 at least. */

	/* Find the number of capturing groups in the input pattern. */
	icgnum = 0;
	for (p = inpat; '\0' != *p; p++)
	{
		/* Count as capture groups only (string) or (?<name>). Especially, avoid
		 * counting (?<=...) (positive look behind) and (?(condition)...) (the
		 * (condition) part).
		 * FIXME: support () inside [].
		 * FIXME: support \. */
		if ((*p == '(') && (*p != '*') &&
		    ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) &&
			 ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?')))
		{
			icgnum++;
		}
	}
	if (0 == icgnum)
	{
		printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat);
		return 9;
	}
#if 0
	if (p[-1] != '$')
	{
		/* FIXME: add $ if needed */
		printf("%s: pattern must end with $ (was: %s)\n", prog, inpat);
		return 9;
	}
#endif

	/* Regex syntax check of the pattern.
	 * FIXME: Add support for "(?J)" */
	options = PCRE_UTF8;
	pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, inpat, erroffset, errptr);
		return 2;
	}

	callout_data.wordlist = NULL;
	callout_data.cgnum = NULL;
	if (word_compare_flag)
	{
		int i;
#if 0
		callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*icgnum);
#endif
		callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*icgnum);
		//printf("ALLOCATED callout_data.cgnum %ld for %d groups\n",
		//sizeof(*callout_data.wordlist)*cgnum, icgnum);
		for (i = 0; i < icgnum; i++)
		{
#if 0
			callout_data.wordlist[i] = NULL;
#endif
			callout_data.cgnum[i] = NULL;

		}
	}

	/* Build the pattern that finds all possible matches. */
	pat = dyn_str_new();
	plevel = 0;
	cglevel = 0;
	icgnum = -1; /* First capture group (plevel==1) is icgnum==0. */

	/* Convert the input regex to the tokenizer regex.
	 * cglevel counts named capture groups
	 * plevel counts all groups
	 *
	 * FIXME: Add support for:
	 * (?x) - comment mode.
	 * (?i) - ignore case.
	 * \ - backslash for ()<>?* .
	 * [] - () inside it
	 * FIXME: Add "(?: ... )" over the result pattern.
	 */
	//dyn_strcat(pat, "(?J)");
	for (p = inpat; '\0' != *p; p++)
	{
		char *re = NULL; /* a regex from the 4.0.regex file */

		switch (*p)
		{
		const char *c;

		case '(':
			if (cglevel > 0)
			{
				printf("Error at position %ld: Tokenizer capture groups cannot have nested groups\n", p-inpat);
			}
			plevel++;
			if ((p[1] == '*') ||
			    ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) ||
			    ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?')))
			{
				break;
			}
			cglevel++;
			if (cglevel > 1)
			{
				printf("Error at position %ld: Tokenizer aregex cannot have capture group level > 1\n", p-inpat);
				free(callout_data.cgnum);
				return 199;
			}
			icgnum++;
			dyn_strcat(pat, "(?:");
			group_name = NULL;
			break;
		case ')':
			plevel--;
			if (cglevel > 0)
			{
				cglevel--;
				/* Add the dict lookup and capturing callback. */
				dyn_strcat(pat, ")(?C)");
			}
			group_name = NULL;
			break;
		case '<':
			/* Remember it as a potential start of a named group. */
			if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') && (p[1]  != '='))
			{
				group_name = p + 1;
			}
			else
				group_name = NULL;
			break;
		case '>':
			if (NULL != group_name)
			{
				/* Check if this is actually a group name */
				for (c = group_name; c < p; c++)
				{
					/* FIXME: 'a' and 'p' are part of a hack for lookup_mark.
					 * FIXME: 'r' is part of a hack for regex names that match affix
					 * class names. The fix is not to use matching names. */
					if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break;
				}
				if (c == p)
				{
					word_classname = malloc(p-group_name+1);
					strncpy(word_classname, group_name, p-group_name);
					word_classname[p-group_name] = '\0';
				} else
				{
					printf("%s: Invalid class name in group name found at '%s'\n",
					       prog, group_name-4);
					word_classname = NULL;
				}
			} else
			{
					word_classname = NULL;
			}
			if (!word_classname)
			{
				group_name = NULL;
				break;
			}
			dyn_strcat(pat, ">");

			lgdebug(6, "Found word-class %s\n", word_classname);
#if 0
			wordlist = readwords(word_classname);
			if (NULL == wordlist)
			{
				printf("i%s: Invalid class name %s in group name\n", prog, word_classname);
				return 100;
			}

			if (!word_compare_flag)
			{
				printf("Invocation without -w is not supported\n");
				return 103;
			}
#endif

			if (word_compare_flag)
			{
				char *t;
				const char *lookup_mark = NULL;
#if 0
				callout_data.wordlist[icgnum] = wordlist;
				printf("WORDLIST %p at cgnum %d\n", wordlist, icgnum);
#endif
				/* Allocate per group info  */
				callout_data.cgnum[icgnum] = malloc(sizeof(*(callout_data.cgnum)[0]));
				callout_data.cgnum[icgnum]->name = NULL;
				//printf("ALLOCATED cgnum[%d]=%p\n", icgnum,
				//callout_data.cgnum[icgnum]);

				/* A hack for testing: Handle WORDpX or WORDaX.
				 * The above a/p marks mean append/prepend X to word before making
				 * the lookup.
				 * FIXME: Find another way to specify that, maybe in the affix file
				 * or in a tokenizer definition file. */
				t = strpbrk(word_classname, "pa");
				if (NULL != t)
				{
					Afdict_class *ac;

					callout_data.cgnum[icgnum]->lookup_mark_pos = *t;
					*t = '\0';
					ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false);
					if (NULL == ac)
					{
						printf("%s: Unknown afclass '%s'\n", prog, t+1);
						return 253;
					}

					/* Check if the requested affix class is defined and is not an
					 * empty string (like the default INFIXMARK). */
					if (0 == ac->length || '\0' == ac->string[0][0])
					{
						printf("%s: No value for afclass '%s'\n", prog, t+1);
						return 252;
					}
					lookup_mark = ac->string[0]; /* FIXME: support more than one value. */
				}

				callout_data.cgnum[icgnum]->lookup_mark = lookup_mark;
				callout_data.cgnum[icgnum]->name = word_classname;

				if (0 == strcmp(word_classname, "DICTWORD"))
				{
					/* Assign data for looking up a word in the main dict. */
					callout_data.cgnum[icgnum]->dict = dict;
					callout_data.cgnum[icgnum]->afclass = NULL;
				}
				else
				if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false))
				{
					callout_data.cgnum[icgnum]->dict = dict->affix_table;
					callout_data.cgnum[icgnum]->afclass = word_classname;
				}
				else
				{
					if ('r' == word_classname[0]) word_classname++;
					re = get_regex_by_name(dict, word_classname);
					if (re)
					{
						lgdebug(6, "Regex %s with modified groups: '%s'\n", word_classname, re);
						callout_data.cgnum[icgnum]->dict = NULL;
						/* FIXME: No need to allocate callout_data.cgnum[icgnum] in this
						 * case. */
					}
					else
					{
						printf("%s: Unknown word classname '%s'\n", prog, word_classname);
						return 254;
					}
				}
				/* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */

			} else
			{
#if 0
				wordalts = make_wordalts(wordlist);
				dyn_strcat(pat, wordalts->str);
				dyn_str_delete(wordalts);
				free(wordlist);
#else
				printf("%s: Invocation without -w is not supported\n", prog);
				return 103;
#endif
			}
			/* Default match for dictionary lookup is ".*".
			 * Allow replacing it by something else.
			 * E.g: .{2,}|a */
			if (')' == p[1])
			{
				if (NULL == re)
				{
					dyn_strcat(pat, ".*");
				}
				else
				{
					dyn_strcat(pat, re);
					free(re);
					re = NULL;
				}
			}
			else
			{
				nplevel = 1;
				/* FIXME: Add support for:
				 * (?x) - comment mode.
				 * \ - backslash for ()<>?* .
				 * [] - () inside it
				 */
				for (; p[1] != '\0' && nplevel > 0; p++)
				{
					switch (p[1])
					{
					case '(':
						if (('?' != p[2]) && ('*' != p[2]) &&
						    ((p[-1] != '(') || (p[0] != '?')))
						{
							printf("%s: Capture_group %d: Nested capture group is not supported\n",
							       prog, icgnum+1);
							return 250;
						}
						nplevel++;
						break;
					case ')':
						nplevel--;
						if (0 == nplevel) continue; /* we are done */
						break;
					}

					c0[0] = p[1];
					dyn_strcat(pat, c0);
				}
				p--;
			}

			word_classname = NULL;
			group_name = NULL;
			continue;
		}

		c0[0] = *p;
		dyn_strcat(pat, c0);
	}

	/* Add '$' at the end if needed. */
	if ('$' != pat->str[pat->end-1]) dyn_strcat(pat, "$");
	/* Add the backtracking callback. */
	dyn_strcat(pat, "(?C1)");

	printf("Modified pattern: %s", pat->str);
	lgdebug(2, " (len %zu/%zu)", pat->end, pat->len);
	printf("\n");

	pcre_callout = callout;

	callout_data.function = 1;
	callout_data.subp_i = 0;
	callout_data.subp[0].s = 0;
	callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE;
	callout_data.subp_ovfl = false;
	callout_data.capture_last = 0;
	callout_data.pattern = pat->str;
	callout_data.alt_counter = 0;

	options = PCRE_UTF8;
	pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, pat->str, erroffset, errptr);
		return 99;
	}

	/* TODO: Check if using JIT may optimize out some needed callouts. */
	options = 0; //PCRE_STUDY_JIT_COMPILE;
	extra  = pcre_study(pcre, options, &errptr);
	if (NULL == extra)
	{
		if (NULL != errptr)
		{
			printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr);
			return 3;
		}
		extra = malloc(sizeof(*extra));
		memset(extra, 0, sizeof(*extra));
	} else
	{
		/* For some reason JIT is sometimes done even though it was not requested.
		 * But the callouts are still invoked as expected in such cases. */
		lgdebug(6, "%s: pcre_study: JIT %ld\n", prog, extra->flags & PCRE_STUDY_JIT_COMPILE);
	}

#if 0
	extra->match_limit = 10000;
	extra->match_limit_recursion = 10000;
	extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION;
#endif

	extra->callout_data = (void *)&callout_data;
	extra->flags |= PCRE_EXTRA_CALLOUT_DATA;

#if 0
	printf("CGNUM %d\n", icgnum);
	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]);
		}
	} else
		printf("CGNUM %p\n", callout_data.cgnum);
#endif

	options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */
	rc = pcre_exec(pcre, extra, str, strlen(str), 0, options, ovector, OVCNT);
	if (rc < 0)
	{
		if (PCRE_ERROR_NOMATCH == rc)
		{
			lgdebug(2, "No match (must always happen)\n");
		} else
		{
			printf("%s: pcre_exec: Error %d\n", prog, rc);
		}
	} else
	{
		printf("Internal error: Unexpected match, rc=%d\n", rc);
	}

	if (0 == rc)
	{
	  rc = OVCNT/3;
	  printf("ovector only has room for %d captured substrings\n", rc - 1);
	}

	printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true);

	if (verbosity > 6)
	{
		if (0 != callout_data.subp_i)
		{
			printf("Callout stack:\n");
			printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false);
		}
	}

	/* Free everything. */
	dyn_str_delete(pat); /* note - callback_data uses parts of pat */
	pcre_free_study(extra); /* safe even if malloc'ed */
	free(pcre);

	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			if (callout_data.cgnum[i])
			{
				/* FIXME: Free also word_classname. */
				free(callout_data.cgnum[i]);
			}
		}
		free(callout_data.cgnum);
	}

#if 0
	if (NULL != callout_data.wordlist)
	{
		int i;

		for (i = 0; i < icgnum; i++)
		{
			free(callout_data.wordlist[i]);
		}
		free(callout_data.wordlist);
	}
#endif

	return 0;
}
Beispiel #10
0
/**
 * Get a regex (of 4.0.regex) by name.
 * Replace all capturing groups by non-capturing ones, since the invoking
 * function cannot currently handle them. Hence back references are not
 * supported. This can be fixed if needed.
 *
 * If a regex name appears multiple times, concatenate them using an alternation
 * bar. Remove anchors ^ and $ if exist (suppose they can only appear at the
 * start and end of the regex, as currently in 4.0.regex).
 */
static char *get_regex_by_name(Dictionary const dict, const char * const name)
{
	dyn_str * const pat = dyn_str_new();
	char *result = NULL;
	Regex_node *re = dict->regex_root;
	const char *p;

	while (NULL != re)
	{
		if (0 == strcmp(re->name, name))
		{
			/* re analyze state */
			bool insqb = false;        /* in square brackets */
			bool qn = false;           /* quote next character */

			p = re->pattern;
			if ('\0' != pat->str[0]) dyn_strcat(pat, "|");
			if ('^' == *p) p++;

			/* Change groups in POSIX regex to PCRE non-capturing groups.
			 * FIXME: Add support for PCRE syntax,
			 * especially, skip (?...) and (*...).
			 * The following code supports backslash and square brackets.
			 * It supposes the regex is valid. */
			for (; '\0' != *p; p++)
			{
				char c0[2] = "\0\0";

				if (qn)
				{
					qn = false;
				}
				else
				{
					switch (*p)
					{
						case '\\':
							qn = true;
							break;
						case '[':
							insqb = true;
							break;
						case ']':
							if (p > re->pattern && '[' == p[-1]) break;
							insqb = false;
							break;
						case '(':
							if (insqb) break;
							dyn_strcat(pat, "(?:");
							continue;
					}
				}
				if ('$' != *p || '\0' != p[1])
				{
					c0[0] = *p;
					dyn_strcat(pat, c0);
				}
			}
		}
		re = re->next;
	}

	if ('\0' != pat->str[0]) result = strdup(pat->str);
	dyn_str_delete(pat);
	return result;
}