Beispiel #1
0
/**
 * Strip the empty-word placeholders out of a linkage.
 *
 * The chosen-disjunct array is compacted in place: every entry whose first
 * word has morpheme type MT_EMPTY is dropped. The link array is then
 * compacted the same way: links that touch a dropped word are discarded,
 * and the word indices of the surviving links are renumbered.
 *
 * XXX Should the dict-cap tokens be removed here too? For now they are
 * left in place for debugging.
 */
static void remove_empty_words(Linkage lkg)
{
	Disjunct **chosen = lkg->chosen_disjuncts;
	int *new_index = alloca(lkg->num_words * sizeof(*new_index));
	size_t src;
	size_t dst = 0;

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts before removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Pass 1: squeeze out the empty words, recording old -> new indices. */
	for (src = 0; src < lkg->num_words; src++)
	{
		if ((NULL == chosen[src]) ||
		    (MT_EMPTY != chosen[src]->word[0]->morpheme_type))
		{
			chosen[dst] = chosen[src];
			new_index[src] = dst;
			dst++;
		}
		else
		{
			new_index[src] = -1; /* This word is dropped. */
		}
	}
	lkg->num_words = dst;
	/* The tail of the array is not released here - free_linkages() does it. */

	if (4 <= verbosity)
	{
		lgdebug(0, "Info: chosen_disjuncts after removing empty words:\n");
		print_chosen_disjuncts_words(lkg);
	}

	/* Pass 2: keep only the links whose both ends survived pass 1. */
	dst = 0;
	for (src = 0; src < lkg->num_links; src++)
	{
		const Link *from = &lkg->link_array[src];
		Link *to;

		if ((-1 == new_index[from->rw]) || (-1 == new_index[from->lw]))
			continue;

		/* dst never overtakes src, so copying downward is safe even when
		 * both refer to the same slot. */
		to = &lkg->link_array[dst];
		to->lw = new_index[from->lw];
		to->rw = new_index[from->rw];
		to->lc = from->lc;
		to->rc = from->rc;
		to->link_name = from->link_name;
		dst++;
	}
	lkg->num_links = dst;
	/* The tail of the array is not released here - free_linkages() does it. */
}
Beispiel #2
0
/**
 * Reuse the given memory pool.
 * Reset the pool pointers without freeing its memory.
 * pool_alloc() will then reuse the existing pool blocks before allocating
 * new blocks.
 * A NULL pool is silently ignored, as pool_delete() does.
 */
void pool_reuse(Pool_desc *mp)
{
	if (NULL == mp) return;
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* Rewind to the first block of the chain; nothing is freed. */
	mp->ring = mp->chain;
	mp->alloc_next = mp->ring;
}
Beispiel #3
0
/**
 * Destroy a memory pool: release every block on its chain and then the
 * pool descriptor itself. A NULL pool is ignored.
 */
void pool_delete(Pool_desc *mp)
{
	if (NULL == mp) return;
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* The size argument POOL_NEXT_BLOCK() needs depends on the allocator. */
#if POOL_ALLOCATOR
	const size_t alloc_size = mp->data_size;
#else
	const size_t alloc_size = mp->element_size;
#endif

	/* Walk the chain, fetching each successor before its block is freed. */
	char *block = mp->chain;
	while (NULL != block)
	{
		char *following = POOL_NEXT_BLOCK(block, alloc_size);
#if POOL_ALLOCATOR
		aligned_free(block);
#else
		free(block);
#endif
		block = following;
	}

	free(mp);
}
Beispiel #4
0
/**
 * Check whether the given locale is known by the system.
 * When locale_t is available, a temporary locale object is created and
 * released again. Otherwise the program's LC_CTYPE locale is actually set
 * in order to find out whether it is accepted; this side effect is
 * harmless because the locale would be set to that value shortly anyway.
 * @param locale Locale string
 * @return True if known, false if unknown.
 */
bool try_locale(const char *locale)
{
#ifdef HAVE_LOCALE_T
	locale_t probe = newlocale_LC_CTYPE(locale);

	if ((locale_t)0 != probe)
	{
		freelocale(probe);
		return true;
	}
	return false;
#else
	lgdebug(D_USER_FILES, "Debug: Setting program's locale \"%s\"", locale);
	if (NULL != setlocale(LC_CTYPE, locale))
	{
		lgdebug(D_USER_FILES, ".\n");
		return true;
	}
	lgdebug(D_USER_FILES, " failed!\n");
	return false;
#endif /* HAVE_LOCALE_T */
}
Beispiel #5
0
/**
 * Find the default locale from the environment.
 * The variables LC_ALL, LC_CTYPE and LANG are consulted in that order, and
 * the first one that is set to a non-empty value is used.
 *
 * On _WIN32 builds running under Cygwin (OSTYPE=cygwin) the value is
 * converted to the Windows style: the first '_' becomes '-' and anything
 * from the first '.' or '@' on is dropped. If no variable is set, a _WIN32
 * build returns the result of win32_getlocale() instead.
 *
 * @return The result of safe_strdup() on the locale found (presumably NULL
 * when nothing was found - confirm against safe_strdup()); on _WIN32 with
 * no environment setting, the string from win32_getlocale() or NULL.
 */
char * get_default_locale(void)
{
	const char *lc_vars[] = {"LC_ALL", "LC_CTYPE", "LANG", NULL};
	char *ev = NULL;
	const char **evname;
	char *locale = NULL;

	for(evname = lc_vars; NULL != *evname; evname++)
	{
		ev = getenv(*evname);
		if ((NULL != ev) && ('\0' != ev[0])) break;
	}
	if (NULL != *evname)
	{
		locale = ev;
		lgdebug(D_USER_FILES, "Debug: Environment locale \"%s=%s\"\n", *evname, ev);
#ifdef _WIN32
		/* If compiled with MSVC/MinGW, we still support running under Cygwin. */
		const char *ostype = getenv("OSTYPE");
		if ((NULL != ostype) && (0 == strcmp(ostype, "cygwin")))
		{
			/* Convert to Windows style locale */
			locale = strdupa(locale);

			/* Only touch a '_' that is really there. With strcspn() a
			 * value without '_' (e.g. "C") would get its terminating NUL
			 * overwritten, and the next scan would run past the buffer. */
			char *underscore = strchr(locale, '_');
			if (NULL != underscore) *underscore = '-';

			/* Cut at the codeset/modifier; when neither exists this
			 * rewrites the existing terminator, which is harmless. */
			locale[strcspn(locale, ".@")] = '\0';
		}
#endif /* _WIN32 */
	}
	else
	{
		lgdebug(D_USER_FILES, "Debug: Environment locale not set\n");
#ifdef _WIN32
		locale = win32_getlocale();
		if (NULL == locale)
			lgdebug(D_USER_FILES, "Debug: Cannot find user default locale\n");
		else
			lgdebug(D_USER_FILES, "Debug: User default locale \"%s\"\n", locale);
		return locale; /* Already strdup'ed */
#endif /* _WIN32 */
	}

	return safe_strdup(locale);
}
Beispiel #6
0
Dictionary dictionary_create_from_db(const char *lang)
{
	char *dbname;
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);

	/* To disable spell-checking, just set the checker to NULL */
	dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
	if (NULL == dict->spell_checker)
		prt_error("Info: Spell checker disabled.");
#endif
	dict->base_knowledge = NULL;
	dict->hpsg_knowledge = NULL;

	dbname = join_path (lang, "dict.db");
	dict->name = string_set_add(dbname, dict->string_set);
	free(dbname);

	/* Set up the database */
	dict->db_handle = object_open(dict->name, db_open, NULL);

	dict->lookup_list = db_lookup_list;
	dict->free_lookup = db_free_llist;
	dict->lookup = db_lookup;
	dict->close = db_close;

	/* Misc remaining common (generic) dict setup work */
	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);

	free_lookup_list(dict, dict_node);

	return dict;
}
Beispiel #7
0
/**
 * Return the number of constituents unchanged.
 * In DEBUG builds the first numcon_total constituents of the context are
 * also dumped through lgdebug().
 */
static int new_style_conjunctions(con_context_t *ctxt, Linkage linkage, int numcon_total)
{
#ifdef DEBUG
	int idx = 0;
	while (idx < numcon_total)
	{
		constituent_t *ct = ctxt->constituent + idx;
		lgdebug(3, "ola %d valid=%d %s start=%s lr=%zu %zu\n", idx,
			ct->valid, ct->type, ct->start_link, ct->left, ct->right);
		idx++;
	}
#endif
	return numcon_total;
}
Beispiel #8
0
/**
 * Return the alternative ID of the given word.
 * The ID is the value stored in the word's alternative_id field: a pointer
 * to the first word of the alternative, set when the alternative is
 * created at issue_word_alternative(). (Any unique identifier would do -
 * a pointer is used for coding convenience.)
 *
 * Both the word and its alternative_id must be non-NULL.
 */
static Gword *find_alternative(Gword *word)
{
	assert(NULL != word, "find_alternative(NULL)");

	Gword *alt_id = word->alternative_id;
	assert(NULL != alt_id, "find_alternative(%s): NULL id", word->subword);

#if 0
	lgdebug(+0, "find_alternative(%s): '%s'\n",
	        word->subword, debug_show_subword(alt_id));
#endif

	return alt_id;
}
Beispiel #9
0
/**
 * Check the parts of a split word against the anysplit regexes.
 *
 * @param sent Sentence; its dictionary's affix table supplies the anysplit
 *             parameters (number of parts and the three regexes).
 * @param word The word that has been split.
 * @param l    Length of the word; a part ending at l is the last one.
 * @param pl   End offset within the word of each part.
 * @return False as soon as a part fails its (non-NULL) regex, else true.
 */
static bool morpheme_match(Sentence sent, const char *word, int l, p_list pl)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as = afdict->anysplit;
	int pos = 0;
	int p;
	Regex_node *re;
	char *prefix_string = alloca(l+1);

	lgdebug(+2, "word=%s: ", word);
	for (p = 0; p < as->nparts; p++)
	{
		/* Extract the current part, word[pos .. pl[p]-1]. */
		strncpy(prefix_string, &word[pos], pl[p]-pos);
		prefix_string[pl[p]-pos] = '\0';

		/* For flexibility, REGPRE is matched only to the prefix part,
		 * REGMID only to the middle suffixes, and REGSUF only to the suffix part -
		 * which cannot be the prefix. */
		if (0 == p) re = as->regpre;
		else if (pl[p] == l) re = as->regsuf;
		else re = as->regmid;

		/* The regex may be NULL (see below), so don't dereference it
		 * unconditionally just to print its name. */
		lgdebug(2, "re=%s part%d=%s: ",
		        (NULL != re) ? re->name : "(null)", p, prefix_string);

		/* A NULL regex always matches */
		if ((NULL != re) && (NULL == match_regex(re, prefix_string)))
		{
			lgdebug(2, "No match\n");
			return false;
		}

		pos = pl[p];
		if (pos == l) break;
	}

	lgdebug(2, "Match\n");
	return true;
}
/**
 * Open the debug log file "<log_dir>/<filename>" for writing and store the
 * stream in g_logfile. The log directory is currently fixed to ".".
 *
 * On any failure (no file name, path too long for the internal buffer,
 * fopen() error) a DBG_ERROR message is issued and g_logfile is not
 * assigned by this call (after a failed fopen() it is NULL).
 *
 * @param filename Name of the log file, relative to the log directory.
 */
void lgdebug_initialize(char* filename){
	static const char log_dir[] = ".";
	char af[100];

	if(filename == NULL){
		lgdebug(DBG_ERROR,"lgdebug: No log file specified \n");
		return;
	}

	/* Build the path with a bounded write; a name that doesn't fit is
	 * rejected rather than allowed to overflow the buffer. */
	int len = snprintf(af, sizeof(af), "%s/%s", log_dir, filename);
	if(len < 0 || (size_t)len >= sizeof(af)){
		lgdebug(DBG_ERROR,"lgdebug: log file path too long: %s/%s\n",
		        log_dir, filename);
		return;
	}

	//open log file
	g_logfile = fopen(af, "w");
	if(g_logfile == NULL){
		lgdebug(DBG_ERROR,"lgdebug: could not open log file %s: %s\n", af, strerror(errno));
		return;
	}
}
Beispiel #11
0
/**
 * Return the directory in which to look for the data files.
 *
 * If custom_data_dir is set, a safe_strdup() copy of it is returned.
 * Otherwise, on _WIN32 only, the directory of the running executable is
 * located and a safe_strdup() copy of it is returned. In all other cases
 * (including any failure, which is reported with prt_error()) the result
 * is NULL.
 */
char * dictionary_get_data_dir(void)
{
	char * data_dir = NULL;

	if (custom_data_dir != NULL) {
		data_dir = safe_strdup(custom_data_dir);
		return data_dir;
	}

#ifdef _WIN32
	/* Dynamically locate invocation directory of our program.
	 * Non-ASCII characters are not supported (files will not be found). */
	char prog_path[MAX_PATH_NAME];
	DWORD len = GetModuleFileNameA(NULL, prog_path, sizeof(prog_path));

	if (0 == len)
	{
		prt_error("Warning: GetModuleFileName error %d\n", (int)GetLastError());
	}
	else if (len >= sizeof(prog_path))
	{
		/* The buffer size is returned when the path got truncated. */
		prt_error("Warning: GetModuleFileName: program path is too long\n");
	}
	else if (!PathRemoveFileSpecA(prog_path))
	{
		prt_error("Warning: Cannot get directory from program path '%s'!\n",
		          prog_path);
	}
	else
	{
		/* Unconvertible characters are marked as '?' */
		const char *unsupported = (NULL != strchr(prog_path, '?')) ?
			" (containing unsupported character)" : "";

		/* The path comes first and the optional remark after it. */
		lgdebug(D_USER_FILES, "Debug: Directory of executable: %s%s\n",
		        prog_path, unsupported);
		data_dir = safe_strdup(prog_path);
	}
#endif /* _WIN32 */

	return data_dir;
}
Beispiel #12
0
/*
 * Reuse the given fake memory pool.
 * Every block on the pool's chain is freed and the chain is left empty;
 * the pool descriptor itself is kept. A NULL pool is ignored.
 */
void pool_reuse(Pool_desc *mp)
{
	if (NULL == mp) return;
	lgdebug(+D_MEMPOOL, "Used %zu elements (pool '%s' created in %s())\n",
	        mp->curr_elements, mp->name, mp->func);

	/* Fetch each successor before its block is released. */
	char *block = mp->chain;
	while (NULL != block)
	{
		char *following = POOL_NEXT_BLOCK(block, mp->element_size);
		free(block);
		block = following;
	}

	mp->chain = NULL;
}
Beispiel #13
0
/**
 * Try the regexes of the given list, in order, against the string s.
 *
 * @param rn Head of the regex list; every node must hold a compiled regex.
 * @param s  The string to match.
 * @return The name of the first non-negated regex that matches, or NULL
 * when none does. When a negated regex matches, the nodes that directly
 * follow it and carry the same name are skipped. Matching errors are
 * reported with prt_regerror() and the search continues.
 */
const char *match_regex(const Regex_node *rn, const char *s)
{
	for (; NULL != rn; rn = rn->next)
	{
		int rc;
		bool match;
		bool nomatch;
		regex_t *re = rn->re;

		/* Make sure the regex has been compiled. */
		assert(re);

#if HAVE_PCRE2_H
		rc = pcre2_match(re->re_code, (PCRE2_SPTR)s,
		                 PCRE2_ZERO_TERMINATED, /*startoffset*/0,
		                 PCRE2_NO_UTF_CHECK, re->re_md, NULL);
		match = (rc >= 0);
		nomatch = (rc == PCRE2_ERROR_NOMATCH);
#else
		rc = regexec(rn->re, s, 0, NULL, /*eflags*/0);
		match = (rc == 0);
		nomatch = (rc == REG_NOMATCH);
#endif

		if (!match)
		{
			/* Anything other than a plain "no match" is an error. */
			if (!nomatch)
				prt_regerror("Regex matching error", rn, rc, -1);
			continue;
		}

		lgdebug(+D_MRE, "%s%s %s\n", &"!"[!rn->neg], rn->name, s);
		if (!rn->neg)
			return rn->name; /* First positive match wins. */

		/* Negative match - step over the following same-named nodes.
		 * The loop increment then moves past the last one of them. */
		const char *nre_name = rn->name;
		while ((NULL != rn->next) && (0 == strcmp(nre_name, rn->next->name)))
			rn = rn->next;
	}

	return NULL; /* No matches. */
}
/**
 * Try the regexes of the given list, in order, against the string s.
 *
 * @param re Head of the regex list.
 * @param s  The string to match.
 * @return The name of the first non-negated regex that matches, or NULL
 * when none does. When a negated regex matches, the nodes that directly
 * follow it and carry the same name are skipped. Nodes without a compiled
 * regex are skipped. Matching errors are reported with prt_regerror() and
 * the search continues.
 */
const char *match_regex(const Regex_node *re, const char *s)
{
    int rc;
    const char *nre_name;

    while (re != NULL)
    {
        if (re->re == NULL)
        {
            /* Re not compiled; if this happens, it's likely an
             *  internal error, but nevermind for now.
             * Step to the next node before continuing - otherwise the
             * same node would be examined forever. */
            re = re->next;
            continue;
        }
        /* Try to match with no extra data (NULL), whole str (0 to strlen(s)),
         * and default options (second 0). */
        /* int rc = pcre_exec(re->re, NULL, s, strlen(s), 0,
         *                    0, ovector, PCRE_OVEC_SIZE); */

        rc = regexec((regex_t*) re->re, s, 0, NULL, 0);
        if (0 == rc)
        {

            lgdebug(+D_MRE, "%s%s %s\n", &"!"[!re->neg], re->name, s);
            if (!re->neg)
                return re->name; /* Match found - return--no multiple matches. */

            /* Negative match - skip this regex name. */
            for (nre_name = re->name; re->next != NULL; re = re->next)
            {
                if (strcmp(nre_name, re->next->name) != 0) break;
            }
        }
        else if (rc != REG_NOMATCH)
        {
            /* We have an error. */
            prt_regerror("Regex matching error", re, rc);
        }
        re = re->next;
    }
    return NULL; /* No matches. */
}
Beispiel #15
0
/**
 * Create a memory pool descriptor.
 * 1. When "align" is requested, the element size is adjusted by
 *    align_size(), the alignment is clamped to the range
 *    [MIN_ALIGNMENT, MAX_ALIGNMENT], and the block size is rounded by
 *    ALIGN(). Otherwise the given element size is used as is, with
 *    MIN_ALIGNMENT.
 * 2. The given parameters are saved in the pool descriptor, with the block
 *    pointers empty and the element count at zero.
 *
 * @param func         Name of the creating function (for debug messages).
 * @param name         Name of the pool (for debug messages).
 * @param num_elements Number of elements per block.
 * @param element_size Size of one element.
 * @param zero_out     Saved in the descriptor.
 * @param align        Whether to align the elements, as described above.
 * @param exact        Saved in the descriptor.
 * @return The new pool descriptor.
 */
Pool_desc *pool_new(const char *func, const char *name,
                    size_t num_elements, size_t element_size,
                    bool zero_out, bool align, bool exact)
{
	Pool_desc *mp = malloc(sizeof(*mp));

	/* Settings that don't depend on the alignment mode. */
	mp->func = func;
	mp->name = name;
	mp->num_elements = num_elements;
	mp->curr_elements = 0;
	mp->zero_out = zero_out;
	mp->exact = exact;

	/* No block has been allocated yet. */
	mp->chain = NULL;
	mp->ring = NULL;
	mp->alloc_next = NULL;
	mp->free_list = NULL;

	/* Element and block geometry. */
	if (!align)
	{
		mp->element_size = element_size;
		mp->alignment = MIN_ALIGNMENT;
		mp->data_size = num_elements * mp->element_size;
		mp->block_size = mp->data_size + FLDSIZE_NEXT;
	}
	else
	{
		mp->element_size = align_size(element_size);
		mp->alignment = MAX(MIN_ALIGNMENT, mp->element_size);
		mp->alignment = MIN(MAX_ALIGNMENT, mp->alignment);
		mp->data_size = num_elements * mp->element_size;
		mp->block_size = ALIGN(mp->data_size + FLDSIZE_NEXT, mp->alignment);
	}

	lgdebug(+D_MEMPOOL, "%sElement size %zu, alignment %zu (pool '%s' created in %s())\n",
	        POOL_ALLOCATOR?"":"(Fake pool allocator) ",
	        mp->element_size, mp->alignment, mp->name, mp->func);
	return mp;
}
Beispiel #16
0
/**
 * Compare a portion of the tokenized string, starting at word_start with
 * length of numchar, to the dictionary or affix class word that is defined
 * in the capture group whose info is pointed to by cgnump.
 *
 * If cgnump->lookup_mark is set, it is prepended ('p') or appended ('a') to
 * the portion before the lookup, according to cgnump->lookup_mark_pos.
 * With no affix class, the result is that of boolean_dictionary_lookup().
 * With an affix class, the result tells whether the word is found in that
 * class.
 *
 * @return True if the word is found. Internal errors (E1, E2) are printed
 * and then treated as "not found".
 *
 * FIXME: Return int instead of bool, so -1 can signify an internal error;
 * see the comments at E1/E2 below.
 */
static bool is_word(const char *word_start, int numchar, cgnum_t *cgnump)
{
	Dictionary const dict = cgnump->dict;
	const char * const afclass = cgnump->afclass;
	const int lookup_mark_len =
		(NULL != cgnump->lookup_mark) ? strlen(cgnump->lookup_mark) : 0;
	char * const word = alloca(numchar+lookup_mark_len+1);
#ifdef AFFIX_DICTIONARY_TREE
	const Dict_node *dn;
#else
	const Afdict_class *ac;
	size_t i;
#endif

	/* Append/prepend stem/infix marks. */
	if (NULL == cgnump->lookup_mark)
	{
		strncpy(word, word_start, numchar);
		word[numchar] = '\0';
	}
	else
	{
		switch (cgnump->lookup_mark_pos)
		{
		case 'p': /* prepend a mark */
			strcpy(word, cgnump->lookup_mark);
			strncat(word, word_start, numchar);
			word[numchar+lookup_mark_len] = '\0';
			break;
		case 'a': /* append a mark */
			strncpy(word, word_start, numchar);
			strcpy(word+numchar, cgnump->lookup_mark);
			break;
		default:
			printf("is_word:E3('%x' %s)", cgnump->lookup_mark_pos, cgnump->lookup_mark);
			strncpy(word, word_start, numchar);
			word[numchar] = '\0';
			break;
		}
	}

	lgdebug(7, "LOOKUP '%s' in %s: ", word, dict->name);
	if (NULL == afclass) return boolean_dictionary_lookup(dict, word);

	/* We don't have for now a tree representation of the affix file, only lists */
#ifdef AFFIX_DICTIONARY_TREE
	dn = lookup_list(dict, word);
	printf("WORD %s afclass %s dn %p\n", word, afclass, (const void *)dn);
	if (NULL == dn) return false;

	for (; NULL != dn; dn = dn->left)
	{
		const char *con = word_only_connector(dn);
		if (NULL == con)
		{
			/* Internal error - nothing else to do for now unless we don't
			 * return bool, but return an int so -1 signifies an error.
			 * Skip this node: a NULL must not reach printf("%s")/strcmp(). */
			printf("is_word(%s):E1 ", word);
			continue;
		}
		printf("CON '%s'\n", con);
		if (0 == strcmp(afclass, con)) return true;
	}
#else
	/* Make it the hard way. */
	ac = afdict_find(dict, afclass, /*notify_err*/false);
	if (NULL == ac)
	{
		/* Internal error - nothing else to do for now unless we don't
		 * return bool, but return an int so -1 signifies an error.
		 * Report "not found" instead of dereferencing the NULL class. */
		printf("is_word(%s):E2 ", word);
		return false;
	}

	for (i = 0; i < ac->length; i++)
	{
		if (0 == strcmp(ac->string[i], word)) return true;
	}
#endif

	return false;
}
Beispiel #17
0
/** The return value is the number of disjuncts deleted.
 *  Implementation notes:
 *  Normally all the identical disjunct-jets are memory shared.
 *  The suffix_id of each connector serves as its reference count
 *  in the power table. Each time when a connector that cannot match
 *  is discovered, its reference count is decreased, and its
 *  nearest_word field is assigned BAD_WORD. Due to the memory sharing,
 *  each such an assignment affects immediately all the identical
 *  disjunct-jets.
 *
 *  Overall flow, as visible here: a power table is built for the
 *  sentence, then left-to-right and right-to-left passes over all the
 *  words alternate until a pass neither changes anything (pc.N_changed)
 *  nor deletes a disjunct. Deleted disjuncts are only unlinked from their
 *  word's disjunct list; they are not freed in this function.
 *  */
static int power_prune(Sentence sent, Parse_Options opts)
{
	power_table pt;
	prune_context pc;
	int N_deleted[2] = {0}; /* [0] counts first deletions, [1] counts dups. */
	int total_deleted = 0;

	power_table_alloc(sent, &pt);
	power_table_init(sent, &pt);

	/* Set up the context shared by the connector-list update functions. */
	pc.pt = &pt;
	pc.power_cost = 0;
	pc.null_links = (opts->min_null_count > 0);
	pc.N_changed = 1;  /* forces it always to make at least two passes */
	pc.sent = sent;

	while (1)
	{
		/* left-to-right pass */
		for (WordIdx w = 0; w < sent->length; w++)
		{
			/* dd always points at the link field that refers to the current
			 * disjunct, so the disjunct can be unlinked by overwriting *dd. */
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->left == NULL)
				{
					/* Nothing to check on the left side of this disjunct. */
					dd = &d->next;  /* NEXT */
					continue;
				}

				/* The first left connector is already marked as unmatchable. */
				bool is_bad = d->left->nearest_word == BAD_WORD;

				if (is_bad || left_connector_list_update(&pc, d->left, w, true) < 0)
				{
					mark_connector_sequence_for_dequeue(d->left, true);
					mark_connector_sequence_for_dequeue(d->right, false);

					/* discard the current disjunct */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++; /* [1] if it was already bad, else [0] */
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			/* NOTE(review): presumably drops table entries invalidated by the
			 * deletions above - confirm against clean_table(). */
			clean_table(pt.r_table_size[w], pt.r_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: l->r pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		/* Stop once a whole pass has had no effect; else reset the counters. */
		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;

		/* right-to-left pass */
		/* The loop ends when w, decremented past 0, compares equal to
		 * (WordIdx) -1 (assumes WordIdx is an unsigned type - TODO confirm). */
		for (WordIdx w = sent->length-1; w != (WordIdx) -1; w--)
		{
			for (Disjunct **dd = &sent->word[w].d; *dd != NULL; /* See: NEXT */)
			{
				Disjunct *d = *dd; /* just for convenience */
				if (d->right == NULL)
				{
					/* Nothing to check on the right side of this disjunct. */
					dd = &d->next;  /* NEXT */
					continue;
				}

				/* The first right connector is already marked as unmatchable. */
				bool is_bad = d->right->nearest_word == BAD_WORD;

				if (is_bad || right_connector_list_update(&pc, d->right, w, true) >= sent->length)
				{
					mark_connector_sequence_for_dequeue(d->right, true);
					mark_connector_sequence_for_dequeue(d->left, false);

					/* Discard the current disjunct. */
					*dd = d->next; /* NEXT - set current disjunct to the next one */
					N_deleted[(int)is_bad]++; /* [1] if it was already bad, else [0] */
					continue;
				}

				dd = &d->next; /* NEXT */
			}

			/* NOTE(review): the mirror of the r_table cleanup in the l->r
			 * pass - confirm against clean_table(). */
			clean_table(pt.l_table_size[w], pt.l_table[w]);
		}

		total_deleted += N_deleted[0] + N_deleted[1];
		lgdebug(D_PRUNE, "Debug: r->l pass changed %d and deleted %d (%d+%d)\n",
		        pc.N_changed, N_deleted[0]+N_deleted[1], N_deleted[0], N_deleted[1]);

		/* Same termination test as after the left-to-right pass. */
		if (pc.N_changed == 0 && N_deleted[0] == 0 && N_deleted[1] == 0) break;
		pc.N_changed = N_deleted[0] = N_deleted[1] = 0;
	}
	power_table_delete(&pt);

	lgdebug(D_PRUNE, "Debug: power prune cost: %d\n", pc.power_cost);

	print_time(opts, "power pruned");
	if (verbosity_level(D_PRUNE))
	{
		prt_error("\n\\");
		prt_error("Debug: After power_pruning:\n\\");
		print_disjunct_counts(sent);
	}

#ifdef DEBUG
	/* Sanity check: no surviving disjunct may still hold a connector that
	 * is marked BAD_WORD. */
	for (WordIdx w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			for (Connector *c = d->left; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
			for (Connector *c = d->right; NULL != c; c = c->next)
				assert(c->nearest_word != BAD_WORD);
		}
	}
#endif

	return total_deleted;
}
Beispiel #18
0
/**
 * Recursively walk the expression tree rooted at exp and record every
 * connector found in it.
 *
 * - CONNECTOR_type: dfs_position is advanced and a PositionConnector is
 *   appended to _right_connectors ('+') or _left_connectors ('-'); the
 *   "leading" flag of that direction is then cleared.
 * - AND_type: an empty list does nothing; a single child is processed in
 *   place (same var); otherwise each child is processed under the variable
 *   name var + 'c' + index, sharing the leading flags and eps vectors.
 * - OR_type: a single child is processed in place; otherwise each child is
 *   processed under var + 'd' + index with its own copies of the leading
 *   flags and eps vectors, and on return each leading flag is true if it
 *   is still true for at least one child.
 *
 * dfs_position, leading_right and leading_left are in/out parameters.
 * Throws std::string on an unknown connector direction.
 */
void WordTag::insert_connectors(Exp* exp, int& dfs_position,
                                bool& leading_right, bool& leading_left,
                                std::vector<int>& eps_right,
                                std::vector<int>& eps_left,
                                char* var, bool root, double parent_cost,
                                Exp* parent_exp, const X_node *word_xnode)
{
  // Cost accumulated along the path from the root down to this expression.
  double cost = parent_cost + exp->cost;

#ifdef DEBUG
  if (0 && verbosity_level(+D_IC)) { // Extreme debug
    printf("Expression type %d for Word%d, var %s:\n", exp->type, _word, var);
    printf("parent_exp: "); print_expression(parent_exp);
    printf("exp: "); print_expression(exp);
  }
#endif

  if (exp->type == CONNECTOR_type) {
    dfs_position++;

    // Temporary connector, handed to the PositionConnector constructor.
    Connector connector;
    connector.multi = exp->multi;
    connector.desc = exp->u.condesc;
    set_connector_length_limit(&connector, _opts);

    switch (exp->dir) {
    case '+':
      // _position remembers the index of this connector in its vector.
      _position.push_back(_right_connectors.size());
      _dir.push_back('+');
      _right_connectors.push_back(
           PositionConnector(parent_exp, &connector, '+', _word, dfs_position,
                             exp->cost, cost, leading_right, false,
                             eps_right, eps_left, word_xnode));
      leading_right = false;
      break;
    case '-':
      _position.push_back(_left_connectors.size());
      _dir.push_back('-');
      _left_connectors.push_back(
           PositionConnector(parent_exp, &connector, '-', _word, dfs_position,
                             exp->cost, cost, false, leading_left,
                             eps_right, eps_left, word_xnode));
      leading_left = false;
      break;
    default:
      throw std::string("Unknown connector direction: ") + exp->dir;
    }
  } else if (exp->type == AND_type) {
    if (exp->u.l == NULL) {
      /* zeroary and */
    } else
      if (exp->u.l != NULL && exp->u.l->next == NULL) {
        /* unary and - skip */
        insert_connectors(exp->u.l->e, dfs_position, leading_right,
             leading_left, eps_right, eps_left, var, root, cost, parent_exp, word_xnode);
      } else {
        int i;
        E_list* l;

        // Copy var into new_var; last_new_var ends up at its terminating
        // NUL, where the per-child suffix is written.
        // NOTE(review): the copy is unbounded - it relies on var (plus the
        // suffix) fitting in MAX_VARIABLE_NAME.
        char new_var[MAX_VARIABLE_NAME];
        char* last_new_var = new_var;
        char* last_var = var;
        while ((*last_new_var = *last_var)) {
          last_new_var++;
          last_var++;
        }

        for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++) {
          // Child variable name: var + 'c' + i (presumably fast_sprintf()
          // writes i as a NUL-terminated number - confirm).
          char* s = last_new_var;
          *s++ = 'c';
          fast_sprintf(s, i);

          insert_connectors(l->e, dfs_position, leading_right, leading_left,
                eps_right, eps_left, new_var, false, cost, parent_exp, word_xnode);

#ifdef POWER_PRUNE_CONNECTORS
          if (leading_right) {
            eps_right.push_back(_variables->epsilon(new_var, '+'));
          }
          if (leading_left) {
            eps_left.push_back(_variables->epsilon(new_var, '-'));
          }
#endif
        }
      }
  } else if (exp->type == OR_type) {
    if (exp->u.l != NULL && exp->u.l->next == NULL) {
      /* unary or - skip */
      insert_connectors(exp->u.l->e, dfs_position, leading_right, leading_left,
          eps_right, eps_left, var, root, cost, exp->u.l->e, word_xnode);
    } else {
      int i;
      E_list* l;
      // Whether any child still has its leading flag set after processing.
      bool ll_true = false;
      bool lr_true = false;

      // Same prefix copy as in the AND_type case above.
      char new_var[MAX_VARIABLE_NAME];
      char* last_new_var = new_var;
      char* last_var = var;
      while ((*last_new_var = *last_var)) {
        last_new_var++;
        last_var++;
      }

#ifdef DEBUG
      if (0 && verbosity_level(+D_IC)) { // Extreme debug
        printf("Word%d, var %s OR_type:\n", _word, var);
        printf("exp mem: "); prt_exp_mem(exp, 0);
      }
#endif

      for (i = 0, l = exp->u.l; l != NULL; l = l->next, i++) {
        // Each alternative starts from its own copy of the caller's state.
        bool lr = leading_right, ll = leading_left;
        std::vector<int> er = eps_right, el = eps_left;

        // Child variable name: var + 'd' + i.
        char* s = last_new_var;
        *s++ = 'd';
        fast_sprintf(s, i);

        lgdebug(+D_IC, "Word%d: var: %s; exp%d=%p; X_node: %s\n",
                _word, var, i, l, word_xnode ? word_xnode->word->subword : "NULL X_node");
        assert(word_xnode != NULL, "NULL X_node for var %s", new_var);
        // At the root OR (no parent expression), when this alternative is
        // neither the expression of the current X_node nor one of its OR
        // alternatives, advance to the next X_node (if there is one). The
        // change persists for the remaining alternatives.
        if (root && parent_exp == NULL && l->e != word_xnode->exp) {
          E_list *we = NULL;

          if (word_xnode->exp->type == OR_type) {
            for (we = word_xnode->exp->u.l; we != NULL; we = we-> next) {
              if (l->e == we->e)
                break;
            }
          }
          if (we == NULL && word_xnode->next != NULL) {
            lgdebug(+D_IC, "Next word_xnode for word %d is needed\n", _word);
            word_xnode = word_xnode->next;
          }
        }
        insert_connectors(l->e, dfs_position, lr, ll, er, el, new_var, false, cost, l->e, word_xnode);

        if (lr)
          lr_true = true;
        if (ll)
          ll_true = true;
      }
      // Report back the disjunction of the children's leading flags.
      leading_right = lr_true;
      leading_left = ll_true;
    }
  }
}
Beispiel #19
0
/**
 * Create a dictionary by reading its definition from a string.
 *
 * The same function serves both the main dictionary and its affix
 * dictionary: a dictionary whose name contains "affix" is set up as an
 * affix dictionary. When affix_name is NULL, processing stops right after
 * the dictionary text has been read. Otherwise the locale is set up, the
 * affix dictionary is loaded through dictionary_six(), the regex file is
 * read and compiled under the dictionary locale, and the remaining
 * generic dictionary fields are filled in.
 *
 * @param lang       Language directory; its last path component is the
 *                   language name.
 * @param input      The dictionary text to read.
 * @param dict_name  The name to record for the dictionary.
 * @param pp_name    Passed to pp_knowledge_open() for base_knowledge.
 * @param cons_name  Passed to pp_knowledge_open() for hpsg_knowledge.
 * @param affix_name Name of the affix file, or NULL (see above).
 * @param regex_name Name of the regex file.
 * @return The new dictionary, or NULL on failure.
 */
static Dictionary
dictionary_six_str(const char * lang,
                   const char * input,
                   const char * dict_name,
                   const char * pp_name, const char * cons_name,
                   const char * affix_name, const char * regex_name)
{
	const char * t;
	Dictionary dict;
	Dict_node *dict_node;

	dict = (Dictionary) xalloc(sizeof(struct Dictionary_s));
	memset(dict, 0, sizeof(struct Dictionary_s));

	/* Language and file-name stuff */
	dict->string_set = string_set_create();
	t = strrchr (lang, '/');
	t = (NULL == t) ? lang : t+1;
	dict->lang = string_set_add(t, dict->string_set);
	lgdebug(D_USER_FILES, "Debug: Language: %s\n", dict->lang);
	dict->name = string_set_add(dict_name, dict->string_set);

	/*
	 * A special setup per dictionary type. The check here assumes the affix
	 * dictionary name contains "affix". FIXME: For not using this
	 * assumption, the dictionary creating stuff needs a rearrangement.
	 */
	if (NULL == strstr(dict->name, "affix"))
	{
		/* To disable spell-checking, just set the checker to NULL */
		dict->spell_checker = spellcheck_create(dict->lang);
#if defined HAVE_HUNSPELL || defined HAVE_ASPELL
		/* TODO:
		 * 1. Set the spell option to 0, to signify no spell checking is done.
		 * 2. On verbosity >= 1, add a detailed message on the reason. */
		if (NULL == dict->spell_checker)
			prt_error("Info: Spell checker disabled.");
#endif
		dict->insert_entry = insert_list;

		dict->lookup_list = lookup_list;
		dict->free_lookup = free_llist;
		dict->lookup = boolean_lookup;
	}
	else
	{
		/*
		 * Affix dictionary.
		 */
		size_t i;

		dict->insert_entry = load_affix;
		dict->lookup = return_true;

		/* initialize the class table */
		dict->afdict_class =
		   malloc(sizeof(*dict->afdict_class) * ARRAY_SIZE(afdict_classname));
		for (i = 0; i < ARRAY_SIZE(afdict_classname); i++)
		{
			dict->afdict_class[i].mem_elems = 0;
			dict->afdict_class[i].length = 0;
			dict->afdict_class[i].string = NULL;
		}
	}
	dict->affix_table = NULL;

	/* Read dictionary from the input string. */
	dict->input = input;
	dict->pin = dict->input;
	if (!read_dictionary(dict))
	{
		dict->pin = NULL;
		dict->input = NULL;
		goto failure;
	}
	dict->pin = NULL;
	dict->input = NULL;

	if (NULL == affix_name)
	{
		/*
		 * The affix table is handled alone in this invocation.
		 * Skip the rest of processing!
		 * FIXME: The dictionary creating stuff needs a rearrangement.
		 */
		return dict;
	}

	/* If we don't have a locale per dictionary, the following
	 * will also set the program's locale. */
	dict->locale = linkgrammar_get_dict_locale(dict);
	set_utf8_program_locale();

#ifdef HAVE_LOCALE_T
	/* We have a locale per dictionary. */
	if (NULL != dict->locale)
		dict->locale_t = newlocale_LC_CTYPE(dict->locale);

	/* If we didn't succeed to set the dictionary locale, the program will
	 * SEGFAULT when it tries to use it with the isw*() functions.
	 * So set it to the current program's locale as a last resort. */
	if (NULL == dict->locale)
	{
		/* setlocale() returns a pointer to memory that later setlocale()
		 * calls (there are several below) may overwrite, so keep a private
		 * copy in the dictionary string set. */
		dict->locale = string_set_add(setlocale(LC_CTYPE, NULL),
		                              dict->string_set);
		dict->locale_t = newlocale_LC_CTYPE(dict->locale);
		prt_error("Warning: Couldn't set dictionary locale! "
		          "Using current program locale %s", dict->locale);
	}
	/* If dict->locale is still not set, there is a bug. */
	assert((locale_t)0 != dict->locale_t, "Dictionary locale is not set.");
#else
	/* We don't have a locale per dictionary - but anyway make sure
	 * dict->locale is consistent with the current program's locale,
	 * and especially that it is not NULL. It still indicates the intended
	 * locale of this dictionary and the locale of the compiled regexs.
	 * A private copy is kept, since the memory returned by setlocale() may
	 * be overwritten by the setlocale() calls below. */
	dict->locale = string_set_add(setlocale(LC_CTYPE, NULL),
	                              dict->string_set);
#endif /* HAVE_LOCALE_T */

	dict->affix_table = dictionary_six(lang, affix_name, NULL, NULL, NULL, NULL);
	if (dict->affix_table == NULL)
	{
		prt_error("Error: Could not open affix file %s", affix_name);
		goto failure;
	}
	if (! afdict_init(dict))
		goto failure;

	/*
	 * Process the regex file.
	 * We have to compile regexs using the dictionary locale,
	 * so make a temporary locale swap.
	 */
	if (read_regex_file(dict, regex_name)) goto failure;

	const char *locale = setlocale(LC_CTYPE, NULL);
	locale = strdupa(locale); /* setlocale() uses static memory. */
	setlocale(LC_CTYPE, dict->locale);
	lgdebug(+D_DICT, "Regexs locale %s\n", setlocale(LC_CTYPE, NULL));

	if (compile_regexs(dict->regex_root, dict))
	{
		/* Restore the program locale before bailing out. */
		locale = setlocale(LC_CTYPE, locale);
		goto failure;
	}
	locale = setlocale(LC_CTYPE, locale);
	assert(NULL != locale, "Cannot restore program locale\n");

#ifdef USE_CORPUS
	dict->corpus = lg_corpus_new();
#endif

	dict->left_wall_defined  = boolean_dictionary_lookup(dict, LEFT_WALL_WORD);
	dict->right_wall_defined = boolean_dictionary_lookup(dict, RIGHT_WALL_WORD);

	dict->empty_word_defined = boolean_dictionary_lookup(dict, EMPTY_WORD_MARK);

	dict->base_knowledge  = pp_knowledge_open(pp_name);
	dict->hpsg_knowledge  = pp_knowledge_open(cons_name);

	dict->unknown_word_defined = boolean_dictionary_lookup(dict, UNKNOWN_WORD);
	dict->use_unknown_word = true;

	dict_node = dictionary_lookup_list(dict, UNLIMITED_CONNECTORS_WORD);
	if (dict_node != NULL)
		dict->unlimited_connector_set = connector_set_create(dict_node->exp);

	free_lookup(dict_node);

	return dict;

failure:
	/* NOTE(review): only the string set and the two top-level structures
	 * are released here; whatever else was allocated meanwhile (class
	 * table, spell checker, affix dictionary contents, ...) is not. */
	string_set_delete(dict->string_set);
	if (dict->affix_table) xfree(dict->affix_table, sizeof(struct Dictionary_s));
	xfree(dict, sizeof(struct Dictionary_s));
	return NULL;
}
Beispiel #20
0
/**
 * Post-process the affix table attached to the given dictionary.
 *
 * In order, this:
 * - reverses the word list of every affix class,
 * - validates the INFIXMARK class (discarding an invalid value), and then
 *   either extracts affixes from the dictionary or installs a dummy mark,
 * - optionally dumps all the non-empty classes for debugging,
 * - builds and compiles the SANEMORPHISM regex into afdict->regex_root,
 * - sorts the UNITS class (and MPRE if AFDICT_ORDER_NOT_PRESERVED),
 * - concatenates the QUOTES and BULLETS classes,
 * - initializes anysplit.
 *
 * @param dict The dictionary whose affix_table is to be processed.
 * @return true on success; false if the SANEMORPHISM regex fails to
 *         compile or if anysplit_init() fails.
 */
static bool afdict_init(Dictionary dict)
{
	Dictionary afdict = dict->affix_table;
	Afdict_class *cls;

	/* FIXME: read_entry() builds word lists in reverse order (can we
	 * just create the list top-down without breaking anything?). Unless
	 * it is fixed to preserve the order, the word list of each affix
	 * class is reversed here, swapping from both ends toward the middle. */
	for (cls = afdict->afdict_class;
	     cls < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; cls++)
	{
		int head = 0;
		int tail = cls->length - 1;

		while (head < tail)
		{
			const char *held = cls->string[head];

			cls->string[head] = cls->string[tail];
			cls->string[tail] = held;
			head++;
			tail--;
		}
	}

	/* Create the affix lists */
	cls = AFCLASS(afdict, AFDICT_INFIXMARK);

	/* The infix mark must be absent, or else be exactly one entry which
	 * is one byte long. Anything else is reported and then dropped. */
	bool bad_infixmark = (cls->length > 1) ||
		((cls->length == 1) && (strlen(cls->string[0]) != 1));

	if (bad_infixmark)
	{
		prt_error("Error: afdict_init: Invalid value for class %s in file %s"
		          " (should have been one ASCII punctuation - ignored)\n",
		          afdict_classname[AFDICT_INFIXMARK], afdict->name);
		free((void *)cls->string);
		cls->string = NULL;
		cls->mem_elems = 0;
		cls->length = 0;
	}

	if (1 != cls->length)
	{
		/* No INFIX_MARK - create a dummy one that always mismatches */
		affix_list_add(afdict, &afdict->afdict_class[AFDICT_INFIXMARK], "");
	}
	else if ((0 == AFCLASS(afdict, AFDICT_PRE)->length) &&
	         (0 == AFCLASS(afdict, AFDICT_SUF)->length))
	{
		/* XXX For now there is a possibility to use predefined SUF and PRE
		 * lists. So the affixes are extracted from the dict only when
		 * neither SUF nor PRE is defined. */
		char last_entry[MAX_WORD+1] = "";

		get_dict_affixes(dict, dict->root, cls->string[0][0], last_entry);
	}

	if (debug_level(+D_AI))
	{
		for (cls = afdict->afdict_class;
		     cls < &afdict->afdict_class[ARRAY_SIZE(afdict_classname)]; cls++)
		{
			size_t k;

			if (0 == cls->length) continue;

			lgdebug(+0, "Class %s, %zd items:",
			        afdict_classname[cls-afdict->afdict_class], cls->length);
			for (k = 0; k < cls->length; k++)
			{
				lgdebug(0, " '%s'", cls->string[k]);
			}
			lgdebug(0, "\n");
		}
	}
#undef D_AI

	/* The SANEMORPHISM regex is kept in the regex_root element of the
	 * affix dictionary (unused up to this point), in a precompiled form. */
	assert(NULL == afdict->regex_root, "SM regex is already assigned");
	cls = AFCLASS(afdict, AFDICT_SANEMORPHISM);
	if (0 != cls->length)
	{
		/* The regex used to be converted to: ^((original-regex)b)+$
		 * In the initial wordgraph version word boundaries are not
		 * supported, so instead it is converted to: ^(original-regex)+$ */
#ifdef WORD_BOUNDARIES
		const char *re_open = "^((";
		const char *re_close = ")b)+$";
#else
		const char *re_open = "^(";
		const char *re_close = ")+$";
#endif
		dyn_str *rebuf = dyn_str_new();
		Regex_node *sm_re = malloc(sizeof(*sm_re));
		int rc;

		dyn_strcat(rebuf, re_open);
		dyn_strcat(rebuf, cls->string[0]);
		dyn_strcat(rebuf, re_close);

		sm_re->pattern = strdup(rebuf->str);
		dyn_str_delete(rebuf);

		sm_re->name = strdup(afdict_classname[AFDICT_SANEMORPHISM]);
		sm_re->re = NULL;
		sm_re->next = NULL;
		sm_re->neg = false;
		afdict->regex_root = sm_re;

		rc = compile_regexs(afdict->regex_root, afdict);
		if (0 != rc)
		{
			prt_error("Error: afdict_init: Failed to compile "
			          "regex '%s' in file %s, return code %d\n",
			          afdict_classname[AFDICT_SANEMORPHISM], afdict->name, rc);
			return false;
		}
		lgdebug(+5, "%s regex %s\n",
		        afdict_classname[AFDICT_SANEMORPHISM], sm_re->pattern);
	}

	/* Sort the UNITS list.
	 * Longer unit names must get split off before shorter ones.
	 * This prevents single-letter splits from screwing things
	 * up. e.g. split 7gram before 7am before 7m
	 */
	cls = AFCLASS(afdict, AFDICT_UNITS);
	if (cls->length > 0)
	{
		qsort(cls->string, cls->length, sizeof(char *), cmplen);
	}

#ifdef AFDICT_ORDER_NOT_PRESERVED
	/* Pre-sort the MPRE list.
	 * Longer subwords have priority over shorter ones, so reverse-sort
	 * by length.
	 * XXX mprefix_split() for Hebrew depends on that. */
	cls = AFCLASS(afdict, AFDICT_MPRE);
	if (cls->length > 0)
	{
		qsort(cls->string, cls->length, sizeof(char *), revcmplen);
	}
#endif /* AFDICT_ORDER_NOT_PRESERVED */

	concat_class(afdict, AFDICT_QUOTES);
	concat_class(afdict, AFDICT_BULLETS);

	return anysplit_init(afdict) ? true : false;
}
Beispiel #21
0
/**
 * Compute the words to be displayed for a linkage.
 *
 * For each word position of the linkage a display string is chosen:
 * - A word whose chosen disjunct is NULL is a null word. Consecutive null
 *   subwords are accumulated into a "null block", which is emitted as one
 *   string at the position of its last subword. Unless the word is a wall,
 *   the string is enclosed in NULLWORD_START / NULLWORD_END.
 * - Otherwise the disjunct's word_string is used. Idiom words get their
 *   last subscript stripped. Under HIDE_MORPHO, the morphemes of one
 *   unsplit word may be joined into a single word with joined subscripts.
 *   The last SUBSCRIPT_MARK is shown as SUBSCRIPT_DOT, and a bracketed
 *   guess mark may be inserted after the base word.
 *
 * Then linkage->word is allocated and filled with the retained words,
 * linkage->num_words is reduced accordingly, the links are renumbered by
 * remap_linkages(), and the matching Wordgraph path is stored in
 * linkage->wg_path_display.
 *
 * @param sent    The sentence; its string_set receives the new strings.
 * @param linkage The linkage to process; modified in place.
 * @param opts    Parse options; only display_morphology is read here.
 */
void compute_chosen_words(Sentence sent, Linkage linkage, Parse_Options opts)
{
	WordIdx i;   /* index of chosen_words */
	WordIdx j;
	Disjunct **cdjp = linkage->chosen_disjuncts;
	const char **chosen_words = alloca(linkage->num_words * sizeof(*chosen_words));
	int *remap = alloca(linkage->num_words * sizeof(*remap));
	bool *show_word = alloca(linkage->num_words * sizeof(*show_word));
	/* NOTE(review): not referenced by name below; presumably it is read
	 * through the HIDE_MORPHO macro - confirm against its definition. */
	bool display_morphology = opts->display_morphology;

	Gword **lwg_path = linkage->wg_path;
	Gword **n_lwg_path = NULL; /* new Wordgraph path, to match chosen_words */

	Gword **nullblock_start = NULL; /* start of a null block, to be put in [] */
	size_t nbsize = 0;              /* number of word in a null block */
	Gword *sentence_word;

	memset(show_word, 0, linkage->num_words * sizeof(*show_word));

	if (verbosity_level(D_CCW))
		print_lwg_path(lwg_path, "Linkage");

	/* Pass 1: choose the display string of each word position. */
	for (i = 0; i < linkage->num_words; i++)
	{
		Disjunct *cdj = cdjp[i];
		Gword *w;              /* current word */
		const Gword *nw;       /* next word (NULL if none) */
		Gword **wgp;           /* wordgraph_path traversing pointer */

		const char *t = NULL;  /* current word string */
		bool at_nullblock_end; /* current word is at end of a nullblock */
		bool join_alt = false; /* morpheme-join this alternative */
		char *s;
		size_t l;
		size_t m;

		lgdebug(D_CCW, "Loop start, word%zu: cdj %s, path %s\n",
		        i, cdj ? cdj->word_string : "NULL",
		        lwg_path[i] ? lwg_path[i]->subword : "NULL");

		w = lwg_path[i];
		/* NOTE(review): reads one element past the current word; assumes
		 * lwg_path has an element (NULL or a word) after the last linkage
		 * word - confirm. */
		nw = lwg_path[i+1];
		wgp = &lwg_path[i];
		sentence_word = wg_get_sentence_word(sent, w);

		/* FIXME If the original word was capitalized in a capitalizable
		 * position, the displayed null word may be its downcase version. */

		if (NULL == cdj) /* a null word (the chosen disjunct was NULL) */
		{
			chosen_words[i] = NULL;
			nbsize++;
			if (NULL == nullblock_start) /* it starts a new null block */
				nullblock_start = wgp;

			/* The block ends when there is no next word, or when the next
			 * word belongs to a different sentence word. */
			at_nullblock_end = (NULL == nw) ||
				(wg_get_sentence_word(sent, nw->unsplit_word) != sentence_word);

			/* Accumulate null words in this alternative */
			if (!at_nullblock_end && (NULL == cdjp[i+1]) &&
			    ((w->morpheme_type == MT_PUNC) == (nw->morpheme_type == MT_PUNC)))
			{
				lgdebug(D_CCW, "Skipping word%zu cdjp=NULL#%zu, path %s\n",
				        i, nbsize, w->subword);
				chosen_words[i] = NULL;
				continue;
			}

			if (NULL != nullblock_start)
			{
				/* If we are here, this null word is an end of a null block */
				lgdebug(+D_CCW, "Handling %zu null words at %zu: ", nbsize, i);

				if (1 == nbsize)
				{
					/* Case 1: A single null subword. */
					lgdebug(D_CCW, "A single null subword.\n");
					t = join_null_word(sent, wgp, nbsize);

					gwordlist_append(&n_lwg_path, w);
				}
				else
				{
					lgdebug(D_CCW, "Combining null subwords");
					/* Use alternative_id to check for start of alternative. */
					if (((*nullblock_start)->alternative_id == *nullblock_start)
					    && at_nullblock_end)
					{
						/* Case 2: A null unsplit_word (all-nulls alternative).*/
						lgdebug(D_CCW, " (null alternative)\n");
						t = sentence_word->subword;

						gwordlist_append(&n_lwg_path, sentence_word);
					}
					else
					{
						/* Case 3: Join together >=2 null morphemes. */
						Gword *wgnull;

						lgdebug(D_CCW, " (null partial word)\n");
						/* The block spans the nbsize path entries ending at wgp. */
						wgnull = wordgraph_null_join(sent, wgp-nbsize+1, wgp);
						gwordlist_append(&n_lwg_path, wgnull);
						t = wgnull->subword;
					}
				}

				nullblock_start = NULL;
				nbsize = 0;
				show_word[i] = true;

				if (MT_WALL != w->morpheme_type)
				{
					/* Put brackets around the null word. */
					l = strlen(t) + 2;
					s = (char *) alloca(l+1);
					s[0] = NULLWORD_START;
					strcpy(&s[1], t);
					s[l-1] = NULLWORD_END;
					s[l] = '\0';
					t = string_set_add(s, sent->string_set);
					lgdebug(D_CCW, " %s\n", t);
					/* Null words have no links, so take care not to drop them. */
				}
			}
		}
		else
		{
			/* This word has a linkage. */

			/* TODO: Suppress "virtual-morphemes", currently the dictcap ones. */
			char *sm;

			t = cdj->word_string;
			/* Print the subscript, as in "dog.n" as opposed to "dog". */

			if (0)
			{
				/* TODO */
			}
			else
			{
				/* Get rid of those ugly ".Ixx" */
				if (is_idiom_word(t))
				{
					s = strdupa(t);
					sm = strrchr(s, SUBSCRIPT_MARK); /* Possible double subscript. */
					UNREACHABLE(NULL == sm); /* We know it has a subscript. */
					*sm = '\0';
					t = string_set_add(s, sent->string_set);
				}
				else if (HIDE_MORPHO)
				{
					/* Concatenate the word morphemes together into one word.
					 * Concatenate their subscripts into one subscript.
					 * Use subscript separator SUBSCRIPT_SEP.
					 * XXX Check whether we can encounter an idiom word here.
					 * FIXME Combining contracted words is not handled yet, because
					 * combining morphemes which have non-LL links to other words is
					 * not yet implemented.
					 * FIXME Move to a separate function. */
					Gword **wgaltp;
					size_t join_len = 0;
					size_t mcnt = 0;

					/* If the alternative contains morpheme subwords, mark it
					 * for joining... */

					const Gword *unsplit_word = w->unsplit_word;
					/* Scan the path entries that share this unsplit word, counting
					 * them in mcnt and summing the space needed for the join. */
					for (wgaltp = wgp, j = i; NULL != *wgaltp; wgaltp++, j++)
					{

						if ((*wgaltp)->unsplit_word != unsplit_word) break;
						if (MT_INFRASTRUCTURE ==
						    (*wgaltp)->unsplit_word->morpheme_type) break;

						mcnt++;

						if (NULL == cdjp[j])
						{
							/* ... but not if it contains a null word */
							join_alt = false;
							break;
						}
						join_len += strlen(cdjp[j]->word_string) + 1;
						if ((*wgaltp)->morpheme_type & IS_REG_MORPHEME)
							join_alt = true;
					}

					if (join_alt)
					{
						/* Join it in two steps: 1. Base words. 2. Subscripts.
						 * FIXME? Can be done in one step (more efficient but maybe
						 * less clear).
						 * Put SUBSCRIPT_SEP between the subscripts.
						 * XXX No 1-1 correspondence between the hidden base words
						 * and the subscripts after the join, in case there are base
						 * words with and without subscripts. */

						const char subscript_sep_str[] = { SUBSCRIPT_SEP, '\0'};
						char *join = calloc(join_len + 1, 1); /* zeroed out */

						join[0] = '\0';

						/* 1. Join base words. (Could just use the unsplit_word.) */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							add_morpheme_unmarked(sent, join, cdjp[i+m]->word_string,
							                      (*wgaltp)->morpheme_type);
						}

						strcat(join, subscript_mark_str()); /* tentative */

						/* 2. Join subscripts. */
						for (wgaltp = wgp, m = 0; m < mcnt; wgaltp++, m++)
						{
							/* Cannot NULLify the word - we may have links to it. */
							if (m != mcnt-1) chosen_words[i+m] = "";

							sm =  strchr(cdjp[i+m]->word_string, SUBSCRIPT_MARK);

							if (NULL != sm)
							{
								/* Supposing stem subscript is .=x (x optional) */
								if (MT_STEM == (*wgaltp)->morpheme_type)
								{
									sm += 1 + STEM_MARK_L; /* sm+strlen(".=") */
									if ('\0' == *sm) sm = NULL;
#if 0
									if ((cnt-1) == m)
									{
										/* Support a prefix-stem combination. In that case
										 * we have just nullified the combined word, so we
										 * need to move it to the position of the prefix.
										 * FIXME: May still not be good enough. */
										move_combined_word = i+m-1;

										/* And the later chosen_word assignment should be:
										 * chosen_words[-1 != move_combined_word ?
										 *    move_combined_word : i] = t;
										 */
									}
									else
									{
										move_combined_word = -1;
									}
#endif
								}
							}
							if (NULL != sm)
							{
								strcat(join, sm+1);
								strcat(join, subscript_sep_str);
							}
						}

						/* Remove an extra mark, if any */
						join_len = strlen(join);
						if ((SUBSCRIPT_SEP == join[join_len-1]) ||
							 (SUBSCRIPT_MARK == join[join_len-1]))
							join[join_len-1] = '\0';

						gwordlist_append(&n_lwg_path, sentence_word);
						t = string_set_add(join, sent->string_set);
						free(join);

						/* Skip the joined morphemes; the joined string is then
						 * stored at the position of the last one. */
						i += mcnt-1;
					}
				}
			}

			if (!join_alt) gwordlist_append(&n_lwg_path, *wgp);

			/*
			 * Add guess marks in [] square brackets, if needed, at the
			 * end of the base word. Convert the badly-printing
			 * SUBSCRIPT_MARK (hex 03 or ^C) into a period.
			 */
			if (t)
			{

				s = strdupa(t);
				sm = strrchr(s, SUBSCRIPT_MARK);
				if (sm) *sm = SUBSCRIPT_DOT;

				if ((!(w->status & WS_GUESS) && (w->status & WS_INDICT))
				    || !DISPLAY_GUESS_MARKS)
				{
					t = string_set_add(s, sent->string_set);
				}
				else
				{
					const char *regex_name = w->regex_name;
					/* 4 = 1(null) + 1(guess_mark) + 2 (sizeof "[]") */
					int baselen = NULL == sm ? strlen(t) : (size_t)(sm-s);
					char guess_mark = 0;

					switch (w->status & WS_GUESS)
					{
						case WS_SPELL:
							guess_mark = GM_SPELL;
							break;
						case WS_RUNON:
							guess_mark = GM_RUNON;
							break;
						case WS_REGEX:
							guess_mark = GM_REGEX;
							break;
						case 0:
							guess_mark = GM_UNKNOWN;
							break;
						default:
							assert(0, "Missing 'case: %2x'", w->status & WS_GUESS);
					}

					/* In the case of display_morphology==0, the guess indication of
					 * the last subword is used as the guess indication of the whole
					 * word.
					 * FIXME? The guess indications of other subwords are ignored in
					 * this mode. This implies that if a first or middle subword has
					 * a guess indication but the last subword doesn't have, no guess
					 * indication would be shown at all. */

					if ((NULL == regex_name) || HIDE_MORPHO) regex_name = "";
					/* s is re-pointed to a new buffer here, while sm keeps pointing
					 * into the previous strdupa() copy, whose subscript part (now
					 * starting with SUBSCRIPT_DOT) is appended below. */
					s = alloca(strlen(t) + strlen(regex_name) + 4);
					strncpy(s, t, baselen);
					s[baselen] = '[';
					s[baselen + 1] = guess_mark;
					strcpy(s + baselen + 2, regex_name);
					strcat(s, "]");
					if (NULL != sm) strcat(s, sm);
					t = string_set_add(s, sent->string_set);
				}
			}
		}

		assert(t != NULL, "Word %zu: NULL", i);
		chosen_words[i] = t;
	}

	/* Conditional test removal of quotation marks and the "capdict" tokens,
	 * to facilitate using diff on sentence batch runs. */
	if (test_enabled("removeZZZ"))
	{
		/* NOTE(review): j is initialized here but not used in this loop. */
		for (i=0, j=0; i<linkage->num_links; i++)
		{
			Link *lnk = &(linkage->link_array[i]);

			if (0 == strcmp("ZZZ", lnk->link_name))
				chosen_words[lnk->rw] = NULL;
		}
	}

	/* If morphology printing is being suppressed, then all links
	 * connecting morphemes will be discarded. */
	if (HIDE_MORPHO)
	{
		/* Discard morphology links. */
		for (i=0; i<linkage->num_links; i++)
		{
			Link * lnk = &linkage->link_array[i];

			if (is_morphology_link(lnk->link_name))
			{
				/* Mark link for discarding. */
				lnk->link_name = NULL;
			}
			else
			{
				/* Mark word for not discarding. */
				show_word[lnk->rw] = true;
				show_word[lnk->lw] = true;
			}
		}
	}

	/* We alloc a little more than needed, but so what... */
	linkage->word = (const char **) exalloc(linkage->num_words*sizeof(char *));

	/* Copy over the chosen words, dropping the discarded words.
	 * However, don't discard existing words (chosen_words[i][0]).
	 * Note that if a word only has morphology links and is not combined with
	 * another word, then it will get displayed with no links at all (e.g.
	 * when explicitly specifying root and suffix for debug: root.= =suf */
	for (i=0, j=0; i<linkage->num_words; ++i)
	{
		if (chosen_words[i] &&
		    (chosen_words[i][0] || (!HIDE_MORPHO || show_word[i])))
		{
			/* NOTE(review): this swaps the freshly allocated linkage->word[j]
			 * into chosen_words[i], which is not read again in this function.
			 * Whether exalloc() initializes its memory is not visible here;
			 * if it doesn't, this reads an uninitialized pointer - confirm. */
			const char *cwtmp = linkage->word[j];
			linkage->word[j] = chosen_words[i];
			chosen_words[i] = cwtmp;
			remap[i] = j;
			j++;
		}
		else
		{
			/* -1 marks a word position that is dropped from the display. */
			remap[i] = -1;
		}
	}
	linkage->num_words = j;

	remap_linkages(linkage, remap); /* Update linkage->link_array / num_links. */

	linkage->wg_path_display = n_lwg_path;

	if (verbosity_level(D_CCW))
		print_lwg_path(n_lwg_path, "Display");
}
Beispiel #22
0
/**
 * Check that the words of a linkage form a valid path in the Wordgraph,
 * and (when possible) that their morpheme types form a valid combination.
 *
 * The chosen disjuncts of the linkage are traversed in order, while a
 * queue of candidate Wordgraph paths is advanced in parallel:
 * - a null word (NULL chosen disjunct) advances every candidate path;
 * - any other word advances only the paths whose current Wordgraph word
 *   appears in the word list of the chosen disjunct.
 * After the last linkage word, a candidate path must have reached a word
 * of type MT_INFRASTRUCTURE (the dummy termination word), else words are
 * missing at the end of the linkage.
 *
 * If such a path is found, sent->null_count is 0 and the affix dictionary
 * has a regex_root (the SANEMORPHISM regex), a string of affix-type
 * characters is built from the path and matched against that regex.
 *
 * @param sent The sentence (Wordgraph, dictionary and null count).
 * @param lkg  The linkage to check. On success lkg->wg_path is set to the
 *             matching Wordgraph path; on failure it is set to NULL and
 *             the linkage is marked as discarded with a violation message.
 * @param opts Parse options; only the verbosity is read here.
 * @return true if the linkage is sane, false otherwise (in which case
 *         sent->num_valid_linkages is also decremented).
 */
bool sane_linkage_morphism(Sentence sent, Linkage lkg, Parse_Options opts)
{
	Wordgraph_pathpos *wp_new = NULL;
	Wordgraph_pathpos *wp_old = NULL;
	Wordgraph_pathpos *wpp;
	Gword **next; /* next Wordgraph words of the current word */

	size_t i;
	Linkage_info * const lifo = &lkg->lifo;

	bool match_found = true; /* if all the words are null - it's still a match */
	Gword **lwg_path = NULL; /* assigned only when a matching path is found */

	Dictionary afdict = sent->dict->affix_table;       /* for SANEMORPHISM */
	char *const affix_types = alloca(sent->length*2 + 1);   /* affix types */

	affix_types[0] = '\0';

	/* Populate the path word queue, initializing the path to NULL. */
	for (next = sent->wordgraph->next; *next; next++)
	{
		wordgraph_path_append(&wp_new, /*path*/NULL, /*add_word*/NULL, *next);
	}
	assert(NULL != wp_new, "Path word queue is empty");

	for (i = 0; i < lkg->num_words; i++)
	{
		Disjunct *cdj;            /* chosen disjunct */

		lgdebug(D_SLM, "%p Word %zu: ", lkg, i);

		if (NULL == wp_new)
		{
			lgdebug(+D_SLM, "- No more words in the wordgraph\n");
			match_found = false;
			break;
		}

		/* The queue built in the previous step becomes the current one. */
		if (wp_old != wp_new)
		{
			wordgraph_path_free(wp_old, true);
			wp_old = wp_new;
		}
		wp_new = NULL;
		//wordgraph_pathpos_print(wp_old);

		cdj = lkg->chosen_disjuncts[i];
		/* Handle null words */
		if (NULL == cdj)
		{
			lgdebug(D_SLM, "- Null word\n");
			/* A null word matches any word in the Wordgraph -
			 * so, unconditionally proceed in all paths in parallel. */
			match_found = false;
			for (wpp = wp_old; NULL != wpp->word; wpp++)
			{
				if (NULL == wpp->word->next)
					continue; /* This path encountered the Wordgraph end */

				/* The null words cannot be marked here because wpp->path consists
				 * of pointers to the Wordgraph words, and these words are common to
				 * all the linkages, with potentially different null words in each
				 * of them. However, the position of the null words can be inferred
				 * from the null words in the word array of the Linkage structure.
				 */
				for (next = wpp->word->next; NULL != *next; next++)
				{
					match_found = true;
					wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
				}
			}
			continue;
		}

		if (!match_found)
		{
			/* The message itself carries no newline, so that the period
			 * added by prt_error() stays on the same line. */
			const char *e = "Internal error: Too many words in the linkage";
			lgdebug(D_SLM, "- %s\n", e);
			prt_error("Error: %s.\n", e);
			break;
		}

		assert(MT_EMPTY != cdj->word[0]->morpheme_type); /* already discarded */

		if (debug_level(D_SLM)) print_with_subscript_dot(cdj->string);

		match_found = false;
		/* Proceed in all the paths in which the word is found. */
		for (wpp = wp_old; NULL != wpp->word; wpp++)
		{
			const Gword **wlp; /* disjunct word list */

			for (wlp = cdj->word; *wlp; wlp++)
			{
				if (*wlp == wpp->word)
				{
					match_found = true;
					for (next = wpp->word->next; NULL != *next; next++)
					{
						wordgraph_path_append(&wp_new, wpp->path, wpp->word, *next);
					}
					break;
				}
			}
		}

		if (!match_found)
		{
			/* FIXME? A message can be added here if there are too many words
			 * in the linkage (can happen only if there is an internal error). */
			lgdebug(D_SLM, "- No Wordgraph match\n");
			break;
		}
		lgdebug(D_SLM, "\n");
	}

	if (match_found)
	{
		match_found = false;
		/* Validate that there are no missing words in the linkage. It is so if
		 * the dummy termination word is found in the new pathpos queue. */
		if (NULL != wp_new)
		{
			for (wpp = wp_new; NULL != wpp->word; wpp++)
			{
				if (MT_INFRASTRUCTURE == wpp->word->morpheme_type) {
					match_found = true;
					/* Exit the loop with wpp of the termination word. */
					break;
				}
			}
		}
		if (!match_found)
			lgdebug(D_SLM, "%p Missing word(s) at the end of the linkage.\n", lkg);
	}

#define DEBUG_morpheme_type 0
	/* Check the morpheme type combination.
	 * If null_count > 0, the morpheme type combination may be invalid
	 * due to null subwords, so skip this check. */
	if (match_found && (0 == sent->null_count) &&
		(NULL != afdict) && (NULL != afdict->regex_root))
	{
		const Gword **w;
		char *affix_types_p = affix_types;

		/* Construct the affix_types string. */
#if DEBUG_morpheme_type
		print_lwg_path(wpp->path);
#endif
		i = 0;
		for (w = wpp->path; *w; w++)
		{
			i++;
			if (MT_EMPTY == (*w)->morpheme_type) continue; /* really a null word */

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wswitch-enum"
			switch ((*w)->morpheme_type)
			{
#pragma GCC diagnostic pop
				default:
					/* What to do with the rest? */
					/* For now they fall through, and are handled as MT_WORD. */
				case MT_WORD:
					*affix_types_p = AFFIXTYPE_WORD;
					break;
				case MT_PREFIX:
					*affix_types_p = AFFIXTYPE_PREFIX;
					break;
				case MT_STEM:
					*affix_types_p = AFFIXTYPE_STEM;
					break;
				case MT_MIDDLE:
					*affix_types_p = AFFIXTYPE_MIDDLE;
					break;
				case MT_SUFFIX:
					*affix_types_p = AFFIXTYPE_SUFFIX;
					break;
			}

#if DEBUG_morpheme_type
			lgdebug(D_SLM, "Word %zu: %s affixtype=%c\n",
			     i, (*w)->subword,  *affix_types_p);
#endif

			affix_types_p++;
		}
		*affix_types_p = '\0';

#ifdef WORD_BOUNDARIES /* not yet implemented */
		{
			const Gword *uw;

			/* If w is an "end subword", return its unsplit word, else NULL. */
			uw = word_boundary(w); /* word_boundary() unimplemented */

			if (NULL != uw)
			{
				*affix_types_p++ = AFFIXTYPE_END;
				lgdebug(D_SLM, "%p End of Gword %s\n", lkg, uw->subword);
			}
		}
#endif

		/* Check if affix_types is valid according to SANEMORPHISM. */
		if (('\0' != affix_types[0]) &&
		    (NULL == match_regex(afdict->regex_root, affix_types)))
		{
			/* Morpheme type combination is invalid */
			match_found = false;
			/* Notify to stdout, so it will be shown along with the result.
			 * XXX We should have a better way to notify. */
			if (0 < opts->verbosity)
				printf("Warning: Invalid morpheme type combination '%s', "
				       "run with !bad and !verbosity=4 to debug\n", affix_types);
		}
	}

	/* Take the matching path before the queues are freed.
	 * NOTE(review): the second argument of wordgraph_path_free() presumably
	 * tells whether to free the paths too, so that on success the paths of
	 * wp_new (including the one taken here) are kept - confirm. */
	if (match_found) lwg_path = (Gword **)wpp->path; /* OK to modify */
	wordgraph_path_free(wp_old, true);
	wordgraph_path_free(wp_new, !match_found);

	if (match_found)
	{
		if ('\0' != affix_types[0])
		{
			lgdebug(D_SLM, "%p Morpheme type combination '%s'\n", lkg, affix_types);
		}
		lgdebug(+D_SLM, "%p SUCCEEDED\n", lkg);
		lkg->wg_path = lwg_path;
		return true;
	}

	/* Oh no ... invalid morpheme combination! */
	sent->num_valid_linkages --;
	lifo->N_violations++;
	lifo->pp_violation_msg = "Invalid morphism construction.";
	lkg->wg_path = NULL;
	lifo->discarded = true;
	lgdebug(D_SLM, "%p FAILED\n", lkg);
	return false;
}
Beispiel #23
0
/**
 * Split randomly.
 *
 * The possible splits of the word into up to as->nparts parts are computed
 * by split(). If there are no more of them than as->altsmax, all are tried;
 * otherwise they are sampled until as->altsmin of them got selected (or
 * all got tried). A split is selected if morpheme_match() accepts it.
 * Each selected split is issued as an alternative through
 * add_alternative(): its first part, with the stem subscript (if defined
 * in the affix file) appended when it is not the whole word, is issued as
 * the single prefix-position word, and the rest of the parts as suffixes.
 *
 * A word that starts with the infix mark, or that is longer than the stem
 * subscript and ends with it, is considered an already-split morpheme and
 * is not split again.
 *
 * @param sent The sentence; its dictionary's affix table holds the
 *             anysplit parameters.
 * @param word The word to split.
 * @return true on success (including when the word is not split again).
 * Return false when:
 * - disabled (i.e. when doing regular language processing).
 * - an error occurs (the behavior then is undefined).
 *   Such an error has not been observed yet.
 */
bool anysplit(Sentence sent, const char *word)
{
	Dictionary afdict = sent->dict->affix_table;
	anysplit_params *as;
	Afdict_class * stemsubscr;
	const char *stem_mark;   /* the stem subscript, NULL if not defined */
	size_t stemsubscr_len;

	size_t l = strlen(word);
	p_list pl;
	size_t pos;
	int p;
	int sample_point = -1;
	size_t nsplits;
	size_t rndtried = 0;
	size_t rndissued = 0;
	size_t i;
	unsigned int seed = 0;
	char *prefix_string;
	char *suffix_string;
	bool use_sampling = true;
	char infix_mark;

	if (NULL == afdict) return false;
	as = afdict->anysplit;

	if ((NULL == as) || (0 == as->nparts)) return false; /* Anysplit disabled */

	if (0 == l)
	{
		prt_error("Warning: anysplit(): word length 0\n");
		return false;
	}

	/* The affix table is consulted only after afdict is known to be
	 * non-NULL. NOTE(review): whether INFIX_MARK() itself tolerates a NULL
	 * argument is not visible here. */
	infix_mark = INFIX_MARK(afdict);

	/* A class with no entries has no usable string[0], so its length is
	 * consulted first (as is done below when the subscript is appended). */
	stemsubscr = AFCLASS(afdict, AFDICT_STEMSUBSCR);
	stem_mark = (0 != stemsubscr->length) ? stemsubscr->string[0] : NULL;
	stemsubscr_len = (NULL == stem_mark) ? 0 : strlen(stem_mark);

	/* Don't split morphemes again. If INFIXMARK and/or SUBSCRMARK are
	 * not defined in the affix file, then morphemes may get split again unless
	 * restricted by REGPRE/REGMID/REGSUF. */
	if (word[0] == infix_mark) return true;
	if ((NULL != stem_mark) && (l > stemsubscr_len) &&
	    (0 == strcmp(word+l-stemsubscr_len, stem_mark)))
		return true;

	/* The buffers are sized for the actual stem subscript, which comes
	 * from the affix file and so is not necessarily 2 bytes long. */
	prefix_string = alloca(l+stemsubscr_len+1); /* word + subscript + NUL */
	suffix_string = alloca(l+1);                /* word + NUL */

	// seed = time(NULL)+(unsigned int)(long)&seed;

#if DEBUG_ANYSPLIT
	gw = word;
#endif

	nsplits = split(l, as->nparts, &as->scl[l]);
	if (0 == nsplits)
	{
		prt_error("Warning: anysplit(): split() failed (shouldn't happen)\n");
		return false;
	}

	if (as->altsmax >= nsplits)
	{
		/* Issue everything */
		sample_point = -1;
		use_sampling = false;
	}

	lgdebug(+2, "Start%s sampling: word=%s, nsplits=%zu, maxsplits=%d, "
	        "as->altsmin=%zu, as->altsmax=%zu\n", use_sampling ? "" : " no",
	        word, nsplits, as->nparts, as->altsmin, as->altsmax);

	/* Try splits until all got tried or, when sampling, until enough of
	 * them got selected. Without sampling they are tried in order. */
	while (rndtried < nsplits && (!use_sampling || (rndissued < as->altsmin)))
	{
		if (use_sampling)
		{
			sample_point = rng_uniform(&seed, nsplits);

			if (sample_point < 0) /* Cannot happen with rand_r() */
			{
				prt_error("Error: rng: %s\n", strerror(errno));
				return false;
			}
		}
		else
		{
			sample_point++;
		}

		lgdebug(2, "Sample: %d ", sample_point);
		if (as->scl[l].p_tried[sample_point])
		{
			lgdebug(4, "(repeated)\n");
			continue;
		}
		lgdebug(4, "(new)");
		rndtried++;
		as->scl[l].p_tried[sample_point] = true;
		if (morpheme_match(sent, word, l, &as->scl[l].sp[sample_point*as->nparts]))
		{
			as->scl[l].p_selected[sample_point] = true;
			rndissued++;
		}
		else
		{
			lgdebug(2, "\n");
		}
	}

	lgdebug(2, "Results: word '%s' (length=%zu): %zu/%zu:\n", word, l, rndissued, nsplits);

	/* Issue an alternative for each of the selected splits. */
	for (i = 0; i < nsplits; i++)
	{
		const char **suffixes = NULL;
		int num_suffixes = 0;

		if (!as->scl[l].p_selected[i]) continue;

		/* pl[p] is the position in the word at which part p ends. */
		pl = &as->scl[l].sp[i*as->nparts];
		pos = 0;
		for (p = 0; p < as->nparts; p++)
		{
			if (pl[0] == (int)l)  /* This is the whole word */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';
			}
			else
			if (0 == pos)   /* The first but not the only morpheme */
			{
				strncpy(prefix_string, &word[pos], pl[p]-pos);
				prefix_string[pl[p]-pos] = '\0';

				if (NULL != stem_mark)
					strcat(prefix_string, stem_mark);
			}
			else           /* 2nd and on morphemes */
			{
				strncpy(suffix_string, &word[pos], pl[p]-pos);
				suffix_string[pl[p]-pos] = '\0';
				altappend(sent, &suffixes, suffix_string);
				num_suffixes++;
			}

			pos = pl[p];
			if (pos == l) break;
		}

		/* Here a leading INFIX_MARK is added to the suffixes if needed. */
		add_alternative(sent,
		   0,NULL, 1,(const char **)&prefix_string, num_suffixes,suffixes);
		free(suffixes);
	}

	return true;
}
Beispiel #24
0
/**
 * Fill the linkage array of the sentence with linkages that pass the
 * morphology sanity check.
 *
 * Linkages are extracted one at a time into the next free slot of
 * sent->lnkages. A linkage accepted by sane_linkage_morphism() keeps its
 * slot (after its empty words are removed); a rejected one is reset so
 * that the slot is reused by the next try. On return, both
 * sent->num_valid_linkages and sent->num_linkages_alloced are set to the
 * number of accepted linkages.
 *
 * @param sent       The sentence to process.
 * @param pex        The extractor from which the links are extracted.
 * @param overflowed If true, linkages are picked randomly.
 * @param opts       Parse options, passed to sane_linkage_morphism().
 */
static void process_linkages(Sentence sent, extractor_t* pex,
                             bool overflowed, Parse_Options opts)
{
	/* Nothing to do if nothing was found; and with no allocated
	 * linkages, bail out in order to avoid a later crash. */
	if ((0 == sent->num_linkages_found) || (0 == sent->num_linkages_alloced))
		return;

	/* Pick random linkages if we get more than what was asked for. */
	bool pick_randomly = overflowed ||
	    (sent->num_linkages_found > (int) sent->num_linkages_alloced);

	size_t n_rejected = 0;   /* linkages that failed the morphism check */
	size_t n_kept = 0;       /* linkages accepted so far (next free slot) */
	int maxtries;
	int itry;

	sent->num_valid_linkages = 0;

	/* In the case of overflow, which will happen for some long
	 * sentences, but is particularly common for the amy/ady random
	 * splitters, we want to find as many morpho-acceptable linkages
	 * as possible, but keep the CPU usage down, as these might be
	 * very rare. This is due to a bug/feature in the interaction
	 * between the word-graph and the parser: valid morph linkages
	 * can be one-in-a-thousand.. or worse.  Search for them, but
	 * don't over-do it.
	 * Note: This problem has recently been alleviated by an
	 * alternatives-compatibility check in the fast matcher - see
	 * alt_connection_possible().
	 */
#define MAX_TRIES 250000

	if (!pick_randomly)
	{
		maxtries = sent->num_linkages_alloced;
	}
	else
	{
		/* Try picking many more linkages, but not more than possible. */
		maxtries = MIN((int) sent->num_linkages_alloced + MAX_TRIES,
		               sent->num_linkages_found);
	}

	/* need_init is true whenever the current slot has not been
	 * initialized yet, i.e. at start and after each accepted linkage. */
	bool need_init = true;
	for (itry = 0; itry < maxtries; itry++)
	{
		Linkage lkg = &sent->lnkages[n_kept];

		/* Negative values tell extract-links to pick randomly; for
		 * reproducible-rand, the actual value is the rand seed. */
		lkg->lifo.index = pick_randomly ? -(itry+1) : itry;

		if (need_init)
		{
			partial_init_linkage(sent, lkg, sent->length);
			need_init = false;
		}
		extract_links(pex, lkg);
		compute_link_names(lkg, sent->string_set);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts before:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/true);
		}

		if (!sane_linkage_morphism(sent, lkg, opts))
		{
			/* Rejected: reset the slot, to be reused by the next try. */
			n_rejected++;
			lkg->num_links = 0;
			lkg->num_words = sent->length;
			memset(lkg->chosen_disjuncts, 0, sent->length * sizeof(Disjunct *));
			continue;
		}

		remove_empty_words(lkg);

		if (verbosity_level(+D_PL))
		{
			err_msg(lg_Debug, "chosen_disjuncts after:\n\\");
			print_chosen_disjuncts_words(lkg, /*prt_opt*/false);
		}

		/* Accepted: advance to the next slot, if there is one. */
		need_init = true;
		n_kept++;
		if (n_kept >= sent->num_linkages_alloced) break;
	}

	/* The last one was alloced, but never actually used. Free it. */
	if (!need_init) free_linkage(&sent->lnkages[n_kept]);

	sent->num_valid_linkages = n_kept;

	/* The remainder of the array is garbage; we never filled it in.
	 * So just pretend that it's shorter than it is */
	sent->num_linkages_alloced = sent->num_valid_linkages;

	/* If the loop was left by the break, itry was not incremented for
	 * the last try, hence the correction of the reported count. */
	lgdebug(D_PARSE, "Info: sane_morphism(): %zu of %d linkages had "
	        "invalid morphology construction\n", n_rejected,
	        itry + (itry != maxtries));
}
Beispiel #25
0
/* Was main() of the test program... */
/**
 * Test driver for the regex-based tokenizer.
 *
 * Converts the user pattern inpat into a "tokenizer regex" in which every
 * capture group is wrapped as "(?:<group>(?C))" and the whole pattern is
 * followed by "$(?C1)", compiles it with PCRE, and runs it over str.
 * The callouts (see callout()) do the per-group word lookup and print
 * every alternative; the overall match is therefore expected to fail.
 *
 * @param inpat  The user pattern. Must contain at least one capture group.
 * @param flags  Not used by this function.
 * @param str    The subject string to split.
 * @param dict   Dictionary used for word, affix-class and regex-name lookups.
 * @return 0 if the pattern was executed; a non-zero diagnostic code if the
 *         pattern was rejected or a resource could not be obtained.
 *
 * All resources acquired here are released on every exit path through the
 * common cleanup code at the end of the function.
 */
static int regex_split(const char *inpat, int flags, const char *str, Dictionary dict)
{
	const char *p;
	const char *c;
	dyn_str *pat = NULL;
	int plevel;  /* paren level */
	int cglevel; /* capture group level */
	int nplevel; /* paren level within named capture group */
	int icgnum;  /* capture group number */
	int ncg = 0; /* number of slots allocated in callout_data.cgnum */
	int options;
	const char *errptr;
	int erroffset;
	pcre *pcre = NULL;
	const char * const prog = "regex_tokenizer_test";
	int rc;
	int retval = 0;
	pcre_extra *extra = NULL;
#define OVCNT 15
	int ovector[OVCNT];
	callout_data_t callout_data;

#if 0
	const char **wordlist;
#endif
	bool word_compare_flag = true;
#ifdef notdef
	dyn_str *wordalts;
#endif
	const char *group_name = NULL;
	char *word_classname;
	char c0[2] = { '\0', '\0' };

	/* Initialized first so the common cleanup code can always inspect them. */
	callout_data.wordlist = NULL;
	callout_data.cgnum = NULL;

	/* FIXME: validate we use PCRE version 2 at least. */

	/* Find the number of capturing groups in the input pattern. */
	icgnum = 0;
	for (p = inpat; '\0' != *p; p++)
	{
		/* Count as capture groups only (string) or (?<name>). Especially, avoid
		 * counting (*VERB), (?<=...) (positive look behind) and
		 * (?(condition)...) (the (condition) part).
		 * This test must stay the exact complement of the "not a capture
		 * group" test in the conversion loop below, because the count is
		 * used to size callout_data.cgnum.
		 * FIXME: support () inside [].
		 * FIXME: support \. */
		if ((*p == '(') && (p[1] != '*') &&
		    ((p[1] != '?') || ((p[2] == '<') && (p[3] != '='))) &&
		    ((p-inpat < 2) || (p[-2] != '(') || (p[-1] != '?')))
		{
			icgnum++;
		}
	}
	if (0 == icgnum)
	{
		printf("%s: pattern must include at least one () group (was: %s)\n", prog, inpat);
		retval = 9;
		goto out;
	}
#if 0
	if (p[-1] != '$')
	{
		/* FIXME: add $ if needed */
		printf("%s: pattern must end with $ (was: %s)\n", prog, inpat);
		retval = 9;
		goto out;
	}
#endif

	/* Regex syntax check of the pattern.
	 * FIXME: Add support for "(?J)" */
	options = PCRE_UTF8;
	pcre = pcre_compile(inpat, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, inpat, erroffset, errptr);
		retval = 2;
		goto out;
	}
	/* The compiled input pattern was needed only for the syntax check. */
	pcre_free(pcre);
	pcre = NULL;

	if (word_compare_flag)
	{
		int i;
#if 0
		callout_data.wordlist = malloc(sizeof(*callout_data.wordlist)*icgnum);
#endif
		callout_data.cgnum = malloc(sizeof(*callout_data.cgnum)*icgnum);
		if (NULL == callout_data.cgnum) goto oom;
		ncg = icgnum;
		//printf("ALLOCATED callout_data.cgnum %ld for %d groups\n",
		//sizeof(*callout_data.wordlist)*cgnum, icgnum);
		for (i = 0; i < ncg; i++)
		{
#if 0
			callout_data.wordlist[i] = NULL;
#endif
			callout_data.cgnum[i] = NULL;
		}
	}

	/* Build the pattern that finds all possible matches. */
	pat = dyn_str_new();
	plevel = 0;
	cglevel = 0;
	icgnum = -1; /* First capture group (plevel==1) is icgnum==0. */

	/* Convert the input regex to the tokenizer regex.
	 * cglevel counts named capture groups
	 * plevel counts all groups
	 *
	 * FIXME: Add support for:
	 * (?x) - comment mode.
	 * (?i) - ignore case.
	 * \ - backslash for ()<>?* .
	 * [] - () inside it
	 * FIXME: Add "(?: ... )" over the result pattern.
	 */
	//dyn_strcat(pat, "(?J)");
	for (p = inpat; '\0' != *p; p++)
	{
		char *re = NULL; /* a regex from the 4.0.regex file */

		switch (*p)
		{
		case '(':
			if (cglevel > 0)
			{
				printf("Error at position %td: Tokenizer capture groups cannot have nested groups\n", p-inpat);
			}
			plevel++;
			/* Not a capture group: (*VERB), (?...) other than (?<name>,
			 * and the (condition) part of (?(condition)...). */
			if ((p[1] == '*') ||
			    ((p[1] == '?') && ((p[2] != '<') || (p[3] == '='))) ||
			    ((p-inpat > 1) && (p[-2] == '(') && (p[-1] == '?')))
			{
				break;
			}
			cglevel++;
			if (cglevel > 1)
			{
				printf("Error at position %td: Tokenizer aregex cannot have capture group level > 1\n", p-inpat);
				retval = 199;
				goto out;
			}
			icgnum++;
			dyn_strcat(pat, "(?:");
			group_name = NULL;
			break;
		case ')':
			plevel--;
			if (cglevel > 0)
			{
				cglevel--;
				/* Add the dict lookup and capturing callback. */
				dyn_strcat(pat, ")(?C)");
			}
			group_name = NULL;
			break;
		case '<':
			/* Remember it as a potential start of a named group. */
			if ((p-2 >= inpat) && (p[-2] == '(') && (p[-1] == '?') && (p[1]  != '='))
			{
				group_name = p + 1;
			}
			else
				group_name = NULL;
			break;
		case '>':
			word_classname = NULL;
			if (NULL != group_name)
			{
				/* Check if this is actually a group name */
				for (c = group_name; c < p; c++)
				{
					/* FIXME: 'a' and 'p' are part of a hack for lookup_mark.
					 * FIXME: 'r' is part of a hack for regex names that match affix
					 * class names. The fix is not to use matching names. */
					if ((*c > 'Z' || *c < 'A') && *c != 'a' && *c != 'p' && *c != 'r') break;
				}
				if (c == p)
				{
					size_t namelen = (size_t)(p - group_name);

					word_classname = malloc(namelen + 1);
					if (NULL == word_classname) goto oom;
					memcpy(word_classname, group_name, namelen);
					word_classname[namelen] = '\0';
				}
				else
				{
					/* group_name-3 is the '(' of "(?<"; it is always
					 * inside inpat (see the '<' case). */
					printf("%s: Invalid class name in group name found at '%s'\n",
					       prog, group_name-3);
				}
			}
			if (NULL == word_classname)
			{
				group_name = NULL;
				break;
			}
			dyn_strcat(pat, ">");

			lgdebug(6, "Found word-class %s\n", word_classname);
#if 0
			wordlist = readwords(word_classname);
			if (NULL == wordlist)
			{
				printf("i%s: Invalid class name %s in group name\n", prog, word_classname);
				free(word_classname);
				retval = 100;
				goto out;
			}

			if (!word_compare_flag)
			{
				printf("Invocation without -w is not supported\n");
				free(word_classname);
				retval = 103;
				goto out;
			}
#endif

			if (word_compare_flag)
			{
				char *t;
				const char *lookup_mark = NULL;
#if 0
				callout_data.wordlist[icgnum] = wordlist;
				printf("WORDLIST %p at cgnum %d\n", wordlist, icgnum);
#endif
				/* Allocate per group info  */
				callout_data.cgnum[icgnum] = malloc(sizeof(*callout_data.cgnum[icgnum]));
				if (NULL == callout_data.cgnum[icgnum])
				{
					free(word_classname);
					goto oom;
				}
				/* From here on the group record owns the word_classname
				 * allocation; the cleanup code frees it through ->name. */
				callout_data.cgnum[icgnum]->name = word_classname;
				//printf("ALLOCATED cgnum[%d]=%p\n", icgnum,
				//callout_data.cgnum[icgnum]);

				/* A hack for testing: Handle WORDpX or WORDaX.
				 * The above a/p marks mean append/prepend X to word before making
				 * the lookup.
				 * FIXME: Find another way to specify that, maybe in the affix file
				 * or in a tokenizer definition file. */
				t = strpbrk(word_classname, "pa");
				if (NULL != t)
				{
					Afdict_class *ac;

					callout_data.cgnum[icgnum]->lookup_mark_pos = *t;
					*t = '\0';
					ac = afdict_find(dict->affix_table, t+1, /*notify_err*/false);
					if (NULL == ac)
					{
						printf("%s: Unknown afclass '%s'\n", prog, t+1);
						retval = 253;
						goto out;
					}

					/* Check if the requested affix class is defined and is not an
					 * empty string (like the default INFIXMARK). */
					if (0 == ac->length || '\0' == ac->string[0][0])
					{
						printf("%s: No value for afclass '%s'\n", prog, t+1);
						retval = 252;
						goto out;
					}
					lookup_mark = ac->string[0]; /* FIXME: support more than one value. */
				}

				callout_data.cgnum[icgnum]->lookup_mark = lookup_mark;

				if (0 == strcmp(word_classname, "DICTWORD"))
				{
					/* Assign data for looking up a word in the main dict. */
					callout_data.cgnum[icgnum]->dict = dict;
					callout_data.cgnum[icgnum]->afclass = NULL;
				}
				else
				if (afdict_find(dict->affix_table, word_classname, /*notify_err*/false))
				{
					callout_data.cgnum[icgnum]->dict = dict->affix_table;
					callout_data.cgnum[icgnum]->afclass = word_classname;
				}
				else
				{
					/* Only the local pointer is advanced; ->name still holds
					 * the start of the allocation. */
					if ('r' == word_classname[0]) word_classname++;
					re = get_regex_by_name(dict, word_classname);
					if (re)
					{
						lgdebug(6, "Regex %s with modified groups: '%s'\n", word_classname, re);
						callout_data.cgnum[icgnum]->dict = NULL;
						/* FIXME: No need to allocate callout_data.cgnum[icgnum] in this
						 * case. */
					}
					else
					{
						printf("%s: Unknown word classname '%s'\n", prog, word_classname);
						retval = 254;
						goto out;
					}
				}
				/* TODO: Assign flags, e.g. for emitting the words with stem/infix marks. */

			} else
			{
#if 0
				wordalts = make_wordalts(wordlist);
				dyn_strcat(pat, wordalts->str);
				dyn_str_delete(wordalts);
				free(wordlist);
#else
				printf("%s: Invocation without -w is not supported\n", prog);
				free(word_classname);
				retval = 103;
				goto out;
#endif
			}
			/* Default match for dictionary lookup is ".*".
			 * Allow replacing it by something else.
			 * E.g: .{2,}|a */
			if (')' == p[1])
			{
				if (NULL == re)
				{
					dyn_strcat(pat, ".*");
				}
				else
				{
					dyn_strcat(pat, re);
					free(re);
					re = NULL;
				}
			}
			else
			{
				/* An explicit constraint follows the name, so a regex
				 * fetched by name is not used; release it here. */
				free(re);
				re = NULL;

				nplevel = 1;
				/* Copy the constraint verbatim up to (not including) the
				 * ')' that closes the named group.
				 * FIXME: Add support for:
				 * (?x) - comment mode.
				 * \ - backslash for ()<>?* .
				 * [] - () inside it
				 */
				for (; p[1] != '\0' && nplevel > 0; p++)
				{
					switch (p[1])
					{
					case '(':
						if (('?' != p[2]) && ('*' != p[2]) &&
						    ((p[-1] != '(') || (p[0] != '?')))
						{
							printf("%s: Capture_group %d: Nested capture group is not supported\n",
							       prog, icgnum+1);
							retval = 250;
							goto out;
						}
						nplevel++;
						break;
					case ')':
						nplevel--;
						if (0 == nplevel) continue; /* we are done */
						break;
					}

					c0[0] = p[1];
					dyn_strcat(pat, c0);
				}
				/* Step back so the outer loop handles the closing ')'. */
				p--;
			}

			word_classname = NULL;
			group_name = NULL;
			continue;
		}

		c0[0] = *p;
		dyn_strcat(pat, c0);
	}

	/* Add '$' at the end if needed. */
	if ('$' != pat->str[pat->end-1]) dyn_strcat(pat, "$");
	/* Add the backtracking callback. */
	dyn_strcat(pat, "(?C1)");

	printf("Modified pattern: %s", pat->str);
	lgdebug(2, " (len %zu/%zu)", pat->end, pat->len);
	printf("\n");

	pcre_callout = callout;

	callout_data.function = 1;
	callout_data.subp_i = 0;
	callout_data.subp[0].s = 0;
	callout_data.subp[0].e = SUBP0END_DEBUG_SIGNATURE;
	callout_data.subp_ovfl = false;
	callout_data.capture_last = 0;
	callout_data.pattern = pat->str;
	callout_data.alt_counter = 0;

	options = PCRE_UTF8;
	pcre = pcre_compile(pat->str, options, &errptr, &erroffset, NULL);
	if (NULL == pcre)
	{
		printf("%s: Internal error: pcre_compile: Error in pattern '%s' at offset %d: %s\n",
		       prog, pat->str, erroffset, errptr);
		retval = 99;
		goto out;
	}

	/* TODO: Check if using JIT may optimize out some needed callouts. */
	options = 0; //PCRE_STUDY_JIT_COMPILE;
	extra  = pcre_study(pcre, options, &errptr);
	if (NULL == extra)
	{
		if (NULL != errptr)
		{
			printf("%s: pcre_study: Error for pattern '%s': %s\n", prog, pat->str, errptr);
			retval = 3;
			goto out;
		}
		/* No study data was produced; a zeroed block is still needed in
		 * order to pass the callout data. */
		extra = calloc(1, sizeof(*extra));
		if (NULL == extra) goto oom;
	} else
	{
		/* For some reason JIT is sometimes done even though it was not requested.
		 * But the callouts are still invoked as expected in such cases. */
		lgdebug(6, "%s: pcre_study: JIT %lu\n", prog, extra->flags & PCRE_STUDY_JIT_COMPILE);
	}

#if 0
	extra->match_limit = 10000;
	extra->match_limit_recursion = 10000;
	extra->flags |= PCRE_EXTRA_MATCH_LIMIT|PCRE_EXTRA_MATCH_LIMIT_RECURSION;
#endif

	extra->callout_data = (void *)&callout_data;
	extra->flags |= PCRE_EXTRA_CALLOUT_DATA;

#if 0
	printf("CGNUM %d\n", icgnum);
	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i <= icgnum; i++)
		{
			printf("callout_data.cgnum[%d] %p\n", i, callout_data.cgnum[i]);
		}
	} else
		printf("CGNUM %p\n", callout_data.cgnum);
#endif

	options = PCRE_ANCHORED; /* XXX Maybe PCRE_NO_START_OPTIMIZE is needed too */
	rc = pcre_exec(pcre, extra, str, strlen(str), 0, options, ovector, OVCNT);
	if (rc < 0)
	{
		if (PCRE_ERROR_NOMATCH == rc)
		{
			lgdebug(2, "No match (must always happen)\n");
		} else
		{
			printf("%s: pcre_exec: Error %d\n", prog, rc);
		}
	} else
	{
		printf("Internal error: Unexpected match, rc=%d\n", rc);
	}

	if (0 == rc)
	{
	  rc = OVCNT/3;
	  printf("ovector only has room for %d captured substrings\n", rc - 1);
	}

	printov(str, (ov_t *)ovector, rc, NULL, /*is_pcreov*/true);

	if (verbosity > 6)
	{
		if (0 != callout_data.subp_i)
		{
			printf("Callout stack:\n");
			printov(str, callout_data.subp, callout_data.subp_i, &callout_data, /*is_pcreov*/false);
		}
	}

	goto out;

oom:
	printf("%s: Out of memory\n", prog);
	retval = 1;

out:
	/* Free everything. */
	if (NULL != pat) dyn_str_delete(pat); /* note - callback_data uses parts of pat */
	if (NULL != extra) pcre_free_study(extra); /* safe even if malloc'ed */
	if (NULL != pcre) pcre_free(pcre);

	if (NULL != callout_data.cgnum)
	{
		int i;

		for (i = 0; i < ncg; i++)
		{
			if (callout_data.cgnum[i])
			{
				/* ->name holds the word_classname allocation. */
				free((void *)callout_data.cgnum[i]->name);
				free(callout_data.cgnum[i]);
			}
		}
		free(callout_data.cgnum);
	}

#if 0
	if (NULL != callout_data.wordlist)
	{
		int i;

		for (i = 0; i < ncg; i++)
		{
			free(callout_data.wordlist[i]);
		}
		free(callout_data.wordlist);
	}
#endif

	return retval;
}
Beispiel #26
0
/**
 * Prune disjuncts using the post-processing "contains one" rules.
 *
 * All the connector strings of the sentence are first put into a multiset.
 * Then, repeatedly, each still-marked disjunct is checked: if one of its
 * connectors matches the selector of a (non-wildcard) rule and
 * rule_satisfiable() reports that the rule's link set cannot be satisfied
 * from the multiset, the disjunct is unmarked and its connector strings
 * are removed from the multiset. The passes go on for as long as a
 * removal changed the multiset. Finally the unmarked disjuncts are deleted.
 *
 * @return The total number of disjuncts that were deleted (0 if there is
 *         no postprocessor or pp pruning is disabled in opts).
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	if (NULL == sent->postprocessor) return 0;
	if (!opts->perform_pp_prune) return 0;

	pp_knowledge *knowledge = sent->postprocessor->knowledge;
	multiset_table *cmt = cms_table_new();

	/* Mark every disjunct as alive, and record all of its connectors
	 * (right side first, then left side). */
	for (size_t w = 0; w < sent->length; w++)
	{
		for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
		{
			d->marked = true;
			for (int side = 0; side < 2; side++)
			{
				Connector *c = (0 != side) ? d->left : d->right;
				for (; NULL != c; c = c->next)
				{
					insert_in_cms_table(cmt, connector_string(c));
				}
			}
		}
	}

	int total_deleted = 0;
	bool change;

	do
	{
		int N_deleted = 0;

		change = false;
		for (size_t w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; NULL != d; d = d->next)
			{
				if (!d->marked) continue;

				/* Look for the first rule that condemns this disjunct. */
				bool deleteme = false;
				for (size_t i = 0;
				     !deleteme && (i < knowledge->n_contains_one_rules); i++)
				{
					pp_rule *rule = &knowledge->contains_one_rules[i];

					/* Rules with a wildcard selector are not handled. */
					if (rule->selector_has_wildcard) continue;

					for (int side = 0; !deleteme && (side < 2); side++)
					{
						Connector *c = (0 != side) ? d->left : d->right;
						for (; !deleteme && (NULL != c); c = c->next)
						{
							/* Is c a trigger link of the rule? */
							if (!post_process_match(rule->selector, connector_string(c)))
								continue;

							/* It is; now check the criterion links. */
							if (!rule_satisfiable(cmt, rule->link_set))
							{
								deleteme = true;
								rule->use_count++;
							}
						}
					}
				}

				if (!deleteme) continue;

				/* Unmark the disjunct (it gets deleted at the end), and
				 * withdraw its connectors from the multiset. */
				N_deleted++;
				total_deleted++;
				d->marked = false;
				for (int side = 0; side < 2; side++)
				{
					Connector *c = (0 != side) ? d->left : d->right;
					for (; NULL != c; c = c->next)
					{
						if (delete_from_cms_table(cmt, connector_string(c)))
							change = true;
					}
				}
			}
		}

		lgdebug(D_PRUNE, "Debug: pp_prune pass deleted %d\n", N_deleted);
	}
	while (change);

	cms_table_delete(cmt);

	if (total_deleted > 0)
	{
		delete_unmarked_disjuncts(sent);
		if (verbosity_level(D_PRUNE))
		{
			prt_error("\n\\");
			prt_error("Debug: After pp_prune:\n\\");
			print_disjunct_counts(sent);
		}
	}

	print_time(opts, "pp pruning");

	return total_deleted;
}
Beispiel #27
0
/**
 * classic_parse() -- parse the given sentence with the original
 * link-grammar parsing algorithm.
 *
 * The parse is attempted with an increasing number of null links, from
 * opts->min_null_count up to (and including) opts->max_null_count (capped
 * at the sentence length), stopping at the first null count that yields
 * a valid linkage.
 *
 * About the save/restore of the disjuncts:
 * pp_and_power_prune() is invoked before do_parse() in order to discard
 * connectors that cannot connect. When null_count==0 it prunes more
 * aggressively, in a way that is not appropriate for null_count>0.
 * So if the null_count==0 attempt fails, the disjuncts that were left
 * are unfit for further attempts. For that reason a copy of the original
 * disjuncts is taken in advance; before the first attempt with
 * null_count>0 it is put back and pp_and_power_prune() is invoked again
 * (with opts->min_null_count temporarily set to 1 to disable the
 * null_count==0 optimization).
 */
void classic_parse(Sentence sent, Parse_Options opts)
{
	const bool start_from_zero = (0 == opts->min_null_count);
	const int null_limit = MIN((int)sent->length, opts->max_null_count);
	fast_matcher_t *matcher = NULL;
	count_context_t *counts = NULL;
	Disjunct **saved = NULL;
	bool pruned_for_nulls = false;

	/* Build lists of disjuncts */
	prepare_to_parse(sent, opts);
	if (resources_exhausted(opts->resources)) return;

	if (start_from_zero && (null_limit > 0))
	{
		/* A retry with null_count>0 is possible: keep the originals. */
		saved = alloca(sent->length * sizeof(*saved));
		for (size_t w = 0; w < sent->length; w++)
		{
			saved[w] = disjuncts_dup(sent->word[w].d);
		}
	}

	for (int nl = opts->min_null_count; nl <= null_limit; nl++)
	{
		if (!pruned_for_nulls)
		{
			if (nl != 0)
			{
				pruned_for_nulls = true;

				/* Don't optimize for null_count==0. */
				if (start_from_zero) opts->min_null_count = 1;

				if (saved != NULL)
				{
					/* The previous attempt was with null_count==0.
					 * Switch back to the saved disjuncts. */
					free_sentence_disjuncts(sent);
					for (size_t w = 0; w < sent->length; w++)
					{
						sent->word[w].d = saved[w];
					}
					saved = NULL;
				}
			}

			pp_and_power_prune(sent, opts);
			if (start_from_zero) opts->min_null_count = 0;
			if (resources_exhausted(opts->resources)) break;

			/* The pruning changed the disjuncts: rebuild the parse
			 * data structures that depend on them. */
			free_count_context(counts, sent);
			free_fast_matcher(sent, matcher);
			pack_sentence(sent);
			counts = alloc_count_context(sent);
			matcher = alloc_fast_matcher(sent);
			print_time(opts, "Initialized fast matcher");
		}

		if (resources_exhausted(opts->resources)) break;
		free_linkages(sent);

		sent->null_count = nl;
		Count_bin hist = do_parse(sent, matcher, counts, sent->null_count, opts);
		s64 total = hist_total(&hist);

		lgdebug(D_PARSE, "Info: Total count with %zu null links:   %lld\n",
		        sent->null_count, total);

		/* total is 64-bit, num_linkages_found is 32-bit. Clamp */
		if ((total > INT_MAX) || (total < 0)) total = INT_MAX;

		sent->num_linkages_found = (int) total;
		print_time(opts, "Counted parses");

		extractor_t *pex = extractor_new(sent->length, sent->rand_state);
		bool ovfl = setup_linkages(sent, pex, matcher, counts, opts);
		process_linkages(sent, pex, ovfl, opts);
		free_extractor(pex);

		post_process_lkgs(sent, opts);

		if (sent->num_valid_linkages > 0) break;

		if ((0 == nl) && (null_limit > 0) && (verbosity > 0))
		{
			prt_error("No complete linkages found.\n");
		}

		/* No valid linkages were found for this null count.
		 * If there was a parse overflow, give up now. */
		if (total > PARSE_NUM_OVERFLOW) break;
	}
	sort_linkages(sent, opts);

	if (saved != NULL)
	{
		/* The saved copy was never put back; dispose of it. */
		for (size_t w = 0; w < sent->length; w++)
		{
			free_disjuncts(saved[w]);
		}
	}
	free_count_context(counts, sent);
	free_fast_matcher(sent, matcher);
}
Beispiel #28
0
/**
 * Locate a data file and open it through the supplied callback.
 *
 * An absolute filename is opened as is. Otherwise the file is searched
 * for, in order, in: the cached directory of a previous successful call
 * (if any), ".", "./data", "..", "../data", the directory returned by
 * dictionary_get_data_dir(), and DEFAULTPATH. When a cached directory
 * exists, only it is tried from the list. If nothing was opened that way,
 * a last attempt is made with filename unchanged.
 *
 * On the first success the location (the path of the opened file with
 * its last two components stripped) is cached per-thread for use by
 * subsequent calls.
 *
 * @param filename  File to open, or NULL to invalidate the cached path.
 * @param opencb    Callback that opens the file; returns NULL on failure.
 * @param user_data Opaque data passed through to opencb.
 * @return Whatever opencb returned for the opened file, or NULL if the
 *         file could not be opened (or if filename is NULL).
 */
void * object_open(const char *filename,
                   void * (*opencb)(const char *, const void *),
                   const void * user_data)
{
	/* Dictionary data directory path cache -- per-thread storage. */
	static TLS char *path_found;
	char *completename = NULL;
	void *fp = NULL;
	char *data_dir = NULL;
	const char **path = NULL;
	bool is_absolute;

	if (NULL == filename)
	{
		/* Invalidate the dictionary data directory path cache. */
		char *pf = path_found;
		path_found = NULL;
		free(pf);
		return NULL;
	}

	if (NULL == path_found)
	{
		data_dir = dictionary_get_data_dir();
		if (verbosity_level(D_USER_FILES))
		{
			char cwd[MAX_PATH_NAME];
			char *cwdp = getcwd(cwd, sizeof(cwd));
			prt_error("Debug: Current directory: %s\n", NULL == cwdp ? "NULL": cwdp);
			prt_error("Debug: Last-resort data directory: %s\n",
					  data_dir ? data_dir : "NULL");
		}
	}

	/* Look for absolute filename.
	 * Unix: starts with leading slash.
	 * Windows: starts with C:\  except that the drive letter may differ. */
	is_absolute = (filename[0] == '/');
#ifdef _WIN32
	/* filename[1] may be inspected only if the string is not empty. */
	if (!is_absolute && ('\0' != filename[0]))
	{
		is_absolute =
			((filename[1] == ':')
			 && ((filename[2] == '\\') || (filename[2] == '/')))
			|| (filename[0] == '\\'); /* UNC path */
	}
#endif /* _WIN32 */

	if (is_absolute)
	{
		/* opencb() returns NULL if the file does not exist. */
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else
	{
		/* A path list in which to search for dictionaries.
		 * path_found, data_dir or DEFAULTPATH may be NULL. */
		const char *dictpath[] =
		{
			path_found,
			".",
			"." DIR_SEPARATOR "data",
			"..",
			".." DIR_SEPARATOR "data",
			data_dir,
			DEFAULTPATH,
		};
		size_t i = sizeof(dictpath)/sizeof(dictpath[0]);

		for (path = dictpath; i-- > 0; path++)
		{
			if (NULL == *path) continue;

			free(completename);
			completename = join_path(*path, filename);
			fp = opencb(completename, user_data);
			lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", completename, NOTFOUND(fp));
			/* With a cached path, only the first (cached) entry is tried. */
			if ((NULL != fp) || (NULL != path_found)) break;
		}
	}

	if (NULL == fp)
	{
		/* Last resort: try the filename as given. */
		fp = opencb(filename, user_data);
		lgdebug(D_USER_FILES, "Debug: Opening file %s%s\n", filename, NOTFOUND(fp));
	}
	else if (NULL == path_found)
	{
		char *pfnd = strdup((NULL != completename) ? completename : filename);

		/* If the copy cannot be allocated, just skip the caching. */
		if (NULL != pfnd)
		{
			if ((0 < verbosity) && (dict_file_open == opencb))
				prt_error("Info: Dictionary found at %s\n", pfnd);
			/* Strip the last two path components. */
			for (size_t i = 0; i < 2; i++)
			{
				char *root = strrchr(pfnd, DIR_SEPARATOR[0]);
				if (NULL != root) *root = '\0';
			}
			path_found = pfnd;
		}
	}

	free(data_dir);
	free(completename);
	return fp;
}
Beispiel #29
0
/**
 * PCRE callout function of the regex tokenizer test.
 *
 * CALLBACK_REP: invoked after a capture group. The captured span is
 * recorded in the cd->subp (sub-pattern) vector, and if the group has
 * per-group info with a dict, the captured string is looked up using
 * is_word().
 * CALLBACK_END: invoked at the end of the pattern. The recorded
 * sub-patterns are printed as one alternative.
 *
 * @param cb  The PCRE callout block; cb->callout_data is the callout_data_t
 *            set up by the caller.
 * @return 0 to continue matching, 1 to force a backtrack (a failed word
 *         lookup, or in order to find the next alternative),
 *         PCRE_ERROR_CALLOUT on an internal inconsistency.
 */
static int callout(pcre_callout_block *cb)
{
	callout_data_t *cd = cb->callout_data;
	ov_t *cb_ov = NULL;

#if 0
	const char **wordlist = NULL;
#endif
	cgnum_t *pcgnum = NULL;
	const char *openp;
	const char *endname;
	bool subp_updated = false;

	/* Only a positive capture_last designates a captured group; use it
	 * for indexing only in that case. */
	if (cb->capture_last > 0)
	{
		cb_ov = (ov_t *)&cb->offset_vector[2*cb->capture_last];
		if (NULL != cd->cgnum) pcgnum = cd->cgnum[cb->capture_last-1];
	}
	lgdebug(6, "Callout %d: capture_last %d cgnum %p\n",
	        cb->callout_number, cb->capture_last, pcgnum);

	if (verbosity >= 6)
		printov(cb->subject, (ov_t *)cb->offset_vector, cb->capture_top, cd, /*is_pcreov*/true);

	switch(cb->callout_number)
	{
	case CALLBACK_REP:
		if (cb->capture_last > 0)
		{
			int subp_i = cd->subp_i;
			ov_t *subp = &cd->subp[subp_i];

			lgdebug(2, "Current capture %d: s=%d, e=%d\n",
			        cb->capture_last, cb_ov->s, cb_ov->e);
			assert(cb_ov->s>=0 && cb_ov->e>=0, "Bad start/end in capture group %d: s=%d e=%d",
			       cb->capture_last, cb_ov->s, cb_ov->e);

			if (verbosity >= 6)
			{
				printf("INITIAL subp:\n");
				if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
				printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);
			}

			/* Record all the captures into the subp (sub-pattern) vector.
			 * If we capture a continuation to another capture then it is a new
			 * capture. Else we update a previous position in subp. There should be
			 * no gaps between the capture strings.
			 * FIXME: Handled null matches properly. Need to use cd->capture_level
			 * to remember at which level a null match has been captured.
			 * FIXME: Move after the word lookup (efficiency).
			 * FIXME: Increment subp instead of cd->subp_i (cosmetic fix). */

			if (cb_ov->s > subp->s)
			{
				if (cb_ov->s == subp->e)
				{
					cd->subp_i++;
					if (cd->subp_i == MAX_SUBP)
					{
						cd->subp_ovfl = true;
						return PCRE_ERROR_CALLOUT;
					}
					lgdebug(2, "OV start gt, update next sub-pattern %d\n", cd->subp_i);
					cd->subp[cd->subp_i] = *cb_ov;
					subp_updated = true;
				}
				else
				{
					/* The first s/e pair is the capture, the second one is
					 * the last recorded sub-pattern. */
					printf("Capture group %d (s=%d e=%d) makes a hole (subp_i %d: s=%d e=%d)\n",
							 cb->capture_last, cb_ov->s, cb_ov->e, subp_i, subp->s, subp->e);
					return PCRE_ERROR_CALLOUT;
				}
			}
			else
			{
				/* A backtrack occurred. */
				for (subp_i = cd->subp_i; subp_i >= 0; subp_i--)
				{
					subp = &cd->subp[subp_i];

					lgdebug(2, "Checking recorded sub-pattern %d: s=%d e=%d: ",
							  subp_i, subp->s,  subp->e);

					if (cb_ov->s == subp->s)
					{
						lgdebug(2, "OV start eq, update sub-pattern %d\n", subp_i);
						*subp = *cb_ov;
						cd->subp_i = subp_i;
						subp_updated = true;
						break;
					}
					lgdebug(2, "Backtrack handling\n");
				}
			}
			assert(subp_i >= 0, "Recorded sub-pattern index");
			assert(subp_updated);
			cd->capture_level[cd->subp_i] = cb->capture_last;

			if (verbosity >= 6)
			{
				printf("AFTER: subp:\n");
				if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
				printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);
			}

			/* Make a dictionary lookup for NAME in capture groups (?<NAME>x)
			 * (x is a constraint for the initial pattern-match comparison done by
			 * PCRE). */
			 // if (pcgnum && * cd->is_constant) printf("is_constant\n");

			/* If we have a cgnum structure with a dict, check if the string to be
			 * matched is in the dict or belongs to the given affix class.
			 * A NULL cgnum->dict means this is a regex from the regex file. */

			if (pcgnum && pcgnum->dict)
			{  /* && !cd->is_constant */
				int numchar = cb_ov->e - cb_ov->s;
				int pos;

				/* Debug: Sanity check. */
				assert(numchar>=0, "numchar=%d", numchar);

				/* Scan the pattern backward, from the end of the group that
				 * precedes this callout, for the "(?<" that opens the named
				 * group, remembering on the way the '>' that ends its name.
				 * The scan never goes before the start of the pattern. */
				endname = NULL;
				openp = NULL;
				for (pos = cb->pattern_position-5; pos >= 0; pos--)
				{
					const char *q = &cd->pattern[pos];

					if (*q == '>') endname = q;
					if (q[0] == '(' && q[1] == '?' && q[2] == '<' && q[3] != '=')
					{
						openp = q;
						break;
					}
				}
				assert(NULL != openp && NULL != endname, "Error: Not in a named group!");

				lgdebug(6, "GROUP NAME %.*s, cgnum %d, ptr %p, numchar %d\n",
						  (int)(endname - openp - 3), openp+3, cb->capture_last-1, pcgnum, numchar);
				/* End of debug sanity check. */

				lgdebug(2, "Try match '%.*s': ", numchar, cb->subject+cb_ov->s);

#if 0
				if (0 == numchar)
				{
					lgdebug(2, "Zero match denied\n");
					return 1;
				}
#endif

				if (!is_word(cb->subject+cb_ov->s, numchar, pcgnum))
				{
						lgdebug(2, "NO MATCH\n");
						return 1;
				}
				lgdebug(6, "MATCH\n");
			}
		}
#if 0
		if (verbosity >= 6)
		{
			printf("DEBUG subp:\n");
			if (cd->subp_ovfl) printf("OVERFLOW\n"); /* shouldn't happen */
			printov(cb->subject, cd->subp, cd->subp_i+1, cd);
		}
#endif

		// cd->is_constant = false;
		return 0; /* continue to match the rest of the regex */

#if 0
	case CALLBACK_CONSTANT_START:
		// cd->is_constant = true;
		return 0;
		break;

	case CALLBACK_CONSTANT_END:
		// cd->is_constant = false;
		return 0;
		break;
#endif

	case CALLBACK_END:
		cd->alt_counter++;
		printf("Alternative %d:\n", cd->alt_counter);
		/* See the comment for SUBP0END_DEBUG_SIGNATURE. */
		assert(cd->subp[0].e>=0, "subp[0].e is %d!", cd->subp[0].e);
		printov(cb->subject, cd->subp, cd->subp_i+1, cd, /*is_pcreov*/false);

		/* Remove the last sub-pattern, in case it is a null string (no need to
		 * check, it can be removed anyway since if it is not a null string it is
		 * going to be replaced on the next match). Else the next match, which
		 * will be without this null string, we emit it again as the last
		 * sub-pattern component. FIXME: It doesn't always help. */

		if (cd->subp_i > 0)
		{
			cd->capture_level[cd->subp_i] = -3; /* mark as invalid, for debug */
			cd->subp_i--;
		}

		// cd->is_constant = false;
		return 1; /* signify a backtrack in order to find the next alternative */

	default:
		assert("Callout: Unreached" && 0);
	}

	return 0; /* Really unreached. */

/*
	printf("Callout %d, data test %d\n"
	       "version %d\n"
			 "subject '%s\n"
			 "subject_length %d\n"
			 "start_match %d\n"
			 "current_position %d\n"
			 "capture_top %d\n"
			 "capture_last %d\n"
			 "pattern_position %d\n"
			 "next_item_length %d\n",
			 cb->callout_number, ((callout_data *)cb->callout_data)->test,
			 cb->version, cb->subject, cb->subject_length, cb->start_match,

			 cb->current_position,
			 cb->capture_top,
			 cb->capture_last,

			 cb->pattern_position,
			 cb->next_item_length);
	return 0;
*/

}
Beispiel #30
0
/**
 * Mark connectors that cannot take part in a valid linkage according to
 * the post-processing "contains one" rules.
 *
 * All the connectors of the sentence are put into a connector multiset
 * (taken from the jet-sharing table if it exists, else from the disjuncts
 * of each word). For every non-wildcard rule, each connector in the
 * multiset that matches the rule's selector gets its nearest_word set to
 * BAD_WORD if rule_satisfiable() reports that the rule's link set cannot
 * be satisfied. Then mark_bad_connectors() is applied to the connectors.
 *
 * @return The number of times mark_bad_connectors() returned true
 *         (reported in the debug message as the number of deleted
 *         disjuncts); 0 if there is no postprocessor or pp pruning is
 *         disabled in opts.
 */
static int pp_prune(Sentence sent, Parse_Options opts)
{
	pp_knowledge *knowledge;
	multiset_table *cmt;

	if (sent->postprocessor == NULL) return 0;
	if (!opts->perform_pp_prune) return 0;

	knowledge = sent->postprocessor->knowledge;
	cmt = cms_table_new();

	/* Fill the multiset with the connectors of the sentence. */
	jet_sharing_t *js = &sent->jet_sharing;
	if (js->table[0] != NULL)
	{
		for (int dir = 0; dir < 2; dir++)
		{
			/* The ids that are used here start at 1. */
			for (unsigned int id = 1; id < js->entries[dir] + 1; id++)
			{
				for (Connector *c = js->table[dir][id].c; NULL != c; c = c->next)
				{
					if (0 == c->refcount) continue;
					insert_in_cms_table(cmt, c);
				}
			}
		}
	}
	else
	{
		for (WordIdx w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
			{
				for (int dir = 0; dir < 2; dir++)
				{
					Connector *first_c = (dir) ? (d->left) : (d->right);
					for (Connector *c = first_c; c != NULL; c = c->next)
					{
						insert_in_cms_table(cmt, c);
					}
				}
			}
		}
	}

	int D_deleted = 0;       /* Number of deleted disjuncts */
	int Cname_deleted = 0;   /* Number of deleted connector names */

	/* Since the cms table is unchanged, after applying a rule once we
	 * know if it will be TRUE or FALSE if we need to apply it again.
	 * Values: -1 (stored as 0xFF): Undecided yet; 0: Rule unsatisfiable;
	 * 1: Rule satisfiable.
	 * The array is sized by its own element type. */
	uint8_t *rule_ok = alloca(knowledge->n_contains_one_rules * sizeof(*rule_ok));
	memset(rule_ok, -1, knowledge->n_contains_one_rules * sizeof(*rule_ok));

	for (size_t i = 0; i < knowledge->n_contains_one_rules; i++)
	{
		/* rule_ok[i] is still undecided here, since each rule is visited
		 * exactly once by this loop. */
		pp_rule* rule = &knowledge->contains_one_rules[i]; /* The ith rule */
		const char *selector = rule->selector;  /* Selector string for this rule */
		pp_linkset *link_set = rule->link_set;  /* The set of criterion links */
		unsigned int hash = cms_hash(selector);

		if (rule->selector_has_wildcard)
		{
			rule_ok[i] = 1;
			continue;  /* If it has a * forget it */
		}

		/* Only the multiset bucket of the selector's hash is scanned. */
		for (Cms *cms = cmt->cms_table[hash]; cms != NULL; cms = cms->next)
		{
			Connector *c = cms->c;
			if (!post_process_match(selector, connector_string(c))) continue;

			ppdebug("Rule %zu: Selector %s, Connector %s\n",
			        i, selector, connector_string(c));
			/* We know c matches the trigger link of the rule. */
			/* Now check the criterion links */
			if ((rule_ok[i] == 0) || !rule_satisfiable(cmt, link_set))
			{
				rule_ok[i] = 0;
				ppdebug("DELETE %s refcount %d\n", connector_string(c), c->refcount);
				c->nearest_word = BAD_WORD;
				Cname_deleted++;
				rule->use_count++;
			}
			else
			{
				rule_ok[i] = 1;
				break;
			}
		}
	}

	/* Iterate over all connectors and mark the bad trigger connectors.
	 * If the marked connector is not the shallow one, note that the
	 * shallow one on the same disjunct cannot be marked too (this could
	 * facilitate faster detection by power_prune()) because this would be
	 * wrongly reflected through the cms table. */

	if (js->table[0] != NULL)
	{
		for (int dir = 0; dir < 2; dir++)
		{
			for (unsigned int id = 1; id < js->entries[dir] + 1; id++)
			{
				for (Connector *c = js->table[dir][id].c; NULL != c; c = c->next)
				{
					if (0 == c->refcount) continue;
					if (mark_bad_connectors(cmt, c))
					{
						D_deleted++;
						break;
					}
				}
			}
		}
	}
	else
	{
		for (WordIdx w = 0; w < sent->length; w++)
		{
			for (Disjunct *d = sent->word[w].d; d != NULL; d = d->next)
			{
				for (int dir = 0; dir < 2; dir++)
				{
					Connector *first_c = (dir) ? (d->left) : (d->right);
					for (Connector *c = first_c; c != NULL; c = c->next)
					{
						if (mark_bad_connectors(cmt, c))
						{
							D_deleted++;
							break;
						}
					}
				}

			}
		}
	}

	lgdebug(+D_PRUNE, "Deleted %d (%d connector names)\n",
	        D_deleted, Cname_deleted);

	cms_table_delete(cmt);

	print_time(opts, "pp pruning");

	return D_deleted;
}