Exemple #1
0
/* public functions */
xtree_t xtree_new(int base, int prime)
{
	xtree_t xnew;
	pool_t p;

	p = pool_new();
	xnew = pmalloc(p, sizeof(xtree_st));
	xnew->p = p;
	xnew->base = (base ? base : 0xf422f);
	xnew->prime = (prime ? prime :  31);
	xnew->count = 0;
	xnew->trees = (node_t *) pmalloc_z(p, sizeof(node_t) * xnew->prime);
	return xnew;
}
Exemple #2
0
Fichier : scws.c Projet : 9466/scws
static void _scws_msegment(scws_t s, int end, int zlen)
{
	word_t **wmap, query;
	struct scws_zchar *zmap;
	unsigned char *txt;
	rule_item_t r1;
	int i, j, k, ch, clen, start;
	pool_t p;

	/* pool used to management some dynamic memory */
	p = pool_new();

	/* create wmap & zmap */
	wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
	zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
	txt = s->txt;
	start = s->off;
	s->zis = -1;

	for (i = 0; start < end; i++)
	{
		ch = txt[start];
		clen = SCWS_CHARLEN(ch);
		if (clen == 1)
		{
			while (start++ < end)
			{
				ch = txt[start];
				if (start == end || SCWS_CHARLEN(txt[start]) > 1)
					break;
				clen++;
			}
			wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
			wmap[i][i]->tf = 0.5;
			wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
			strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
		}
		else
		{
			query = xdict_query(s->d, txt + start, clen);
			wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
			if (query == NULL)
			{
				wmap[i][i]->tf = 0.5;
				wmap[i][i]->idf = 0.0;
				wmap[i][i]->flag = 0;
				strcpy(wmap[i][i]->attr, attr_un);
			}
			else
			{
				ch = query->flag;
				query->flag = SCWS_WORD_FULL;
				memcpy(wmap[i][i], query, sizeof(word_st));
				if (query->attr[0] == '#')
					wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;

				if (ch & SCWS_WORD_MALLOCED)
					free(query);							
			}
			start += clen;
		}
		
		zmap[i].start = start - clen;
		zmap[i].end = start;
	}

	/* fixed real zlength */
	zlen = i;

	/* create word query table */
	for (i = 0; i < zlen; i++)
	{
		k = 0;
		for (j = i+1; j < zlen; j++)
		{
			query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
			if (query == NULL)
				break;
			ch = query->flag;
			if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
			{
				wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
				memcpy(wmap[i][j], query, sizeof(word_st));

				wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;

				for (k = i+1; k <= j; k++)
					wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
			}

			if (ch & SCWS_WORD_MALLOCED)
				free(query);

			if (!(ch & SCWS_WORD_PART))
				break;		
		}
		
		if (k--)
		{
			/* set nr2 to some short name */
			if ((k == (i+1)))
			{
				if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
					wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
				//if (wmap[i][k]->attr[0] == 'n')
					//wmap[i][i]->flag |= SCWS_ZFLAG_N2;
			}				

			/* clean the PART flag for the last word */
			if (k < j)
				wmap[i][k]->flag ^= SCWS_WORD_PART;
		}
	}

	if (s->r == NULL)
		goto do_segment;
	
	/* auto rule set for name & zone & chinese numeric */

	/* one word auto rule check */
	for (i = 0; i < zlen; i++)
	{
		if (SCWS_NO_RULE1(wmap[i][i]->flag))
			continue;

		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
		if (r1 == NULL)
			continue;

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
		{			
			/* prefix, check after (zmin~zmax) */
			// 先检查 zmin 字内是否全部符合要求
			// 再在 zmax 范围内取得符合要求的字
			// int i, j, k, ch, clen, start;
			for (ch = 1; ch <= clen; ch++)
			{
				j = i + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			// 注意原来2字人名,识别后仍为2字的情况
			if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
			{
				if (clen == 1)
					continue;
				wmap[i][i+1]->flag |= SCWS_WORD_PART;
			}
			
			/* ok, got: i & clen */
			k = i + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
			strncpy(wmap[i][k]->attr, r1->attr, 2);

			wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = i+1; j <= k; j++)			
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
				i = k;

			continue;
		}
		
		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}
			
			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			if (wmap[k][i] != NULL)
				continue;

			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				if ((j != i) && (wmap[k][j] != NULL))
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}
			continue;
		}
	}

	/* two words auto rule check (欧阳** , **西路) */
	for (i = zlen - 2; i >= 0; i--)
	{
		/* with value ==> must be have SCWS_WORD_FULL, so needn't check it ag. */
		if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
			continue;

		k = i+1;
		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
		if (r1 == NULL)
			continue;		

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
		{
			for (ch = 1; ch <= clen; ch++)
			{
				j = k + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = k + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			/* ok, got: i & clen */
			k = k + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = SCWS_WORD_FULL;
			strncpy(wmap[i][k]->attr, r1->attr, 2);

			wmap[i][i+1]->flag |= SCWS_WORD_PART;
			for (j = i+2; j <= k; j++)			
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			i--;
			continue;
		}

		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}
			
			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			i = i + 1;
			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				if (wmap[k][j] != NULL)
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}

			i -= (clen+1);
			continue;
		}
	}

	/* real do the segment */
do_segment:

	/* find the easy break point */
	for (i = 0, j = 0; i < zlen; i++)
	{
		if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
			continue;

		if (i > j)
			_scws_mseg_zone(s, j, i-1);

		j = i;
		if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
		{
			_scws_mset_word(s, i, i);
			j++;
		}
	}

	/* the lastest zone */
	if (i > j)
		_scws_mseg_zone(s, j, i-1);

	/* the last single for duality */
	if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))	
	{
		i = s->zis;
		SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
	}

	/* free the wmap & zmap */
	pool_free(p);
	darray_free((void **) wmap);
}
Exemple #3
0
/* open the text dict */
static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
{
	xdict_t xd;
	xtree_t xt;
	char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
	struct stat st1, st2;

	// check the input filepath
	_realpath(fpath, buf);
	if (stat(buf, &st1) < 0)
		return NULL;

	// check dest file & orginal file, compare there mtime
#ifdef WIN32
	{
		char *tmp_ptr;
		GetTempPath(sizeof(tmpfile) - 20, tmpfile);
		tmp_ptr = tmpfile + strlen(tmpfile);
		if (tmp_ptr[-1] == '\\') tmp_ptr--;
		sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
	}
#else
	sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
#endif
	if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
	{
		xdb_t x;
		if ((x = xdb_open(tmpfile, 'r')) != NULL)
		{
			xd = (xdict_t) malloc(sizeof(xdict_st));
			memset(xd, 0, sizeof(xdict_st));
			xd->ref = 1;

			if (mode & SCWS_XDICT_MEM)
			{
				/* convert the xdb(disk) -> xtree(memory) */
				if ((xt = xdb_to_xtree(x, NULL)) != NULL)
				{
					xdb_close(x);
					xd->xdict = (void *) xt;
					xd->xmode = SCWS_XDICT_MEM;
					return xd;
				}
			}
			xd->xmode = SCWS_XDICT_XDB;
			xd->xdict = (void *) x;
			return xd;
		}
	}

	// create xtree
	if ((xt = xtree_new(0, 0)) == NULL)
		return NULL;
	else
	{
		int cl, kl;
		FILE *fp;
		word_st word, *w;
		char *key, *part, *last, *delim = " \t\r\n";

		// re-build the xdb file from text file	
		if ((fp = fopen(buf, "r")) == NULL)
			return NULL;

		// parse every line
		word.attr[2] = '\0';
		while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
		{
			// <word>[\t<tf>[\t<idf>[\t<attr>]]]		
			if (buf[0] == ';' || buf[0] == '#') continue;

			key = _strtok_r(buf, delim, &last);
			if (key == NULL) continue;
			kl = strlen(key);

			// init the word
			do
			{
				word.tf = word.idf = 1.0;
				word.flag = SCWS_WORD_FULL;
				word.attr[0] = '@';
				word.attr[1] = '\0';

				if (!(part = _strtok_r(NULL, delim, &last))) break;
				word.tf = (float) atof(part);

				if (!(part = _strtok_r(NULL, delim, &last))) break;
				word.idf = (float) atof(part);

				if ((part = _strtok_r(NULL, delim, &last)))
				{
					word.attr[0] = part[0];
					if (part[1]) word.attr[1] = part[1];
				}
			}
			while (0);

			// save into xtree
			if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
			{
				w = (word_st *) pmalloc(xt->p, sizeof(word_st));
				memcpy(w, &word, sizeof(word));
				xtree_nput(xt, w, sizeof(word), key, kl);
			}
			else
			{
				w->tf = word.tf;
				w->idf = word.idf;
				w->flag |= word.flag;
				strcpy(w->attr, word.attr);
			}

			// parse the part	
			cl = ml[(unsigned char) (key[0])];
			while (1)
			{
				cl += ml[(unsigned char) (key[cl])];
				if (cl >= kl) break;

				if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
					w->flag |= SCWS_WORD_PART;
				else
				{
					w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
					w->flag = SCWS_WORD_PART;
					xtree_nput(xt, w, sizeof(word), key, cl);
				}
			}
		}
		fclose(fp);

		// optimize the xtree & save to xdb
		xtree_optimize(xt);
		unlink(tmpfile);
		xtree_to_xdb(xt, tmpfile);
		chmod(tmpfile, 0777);

		// return xtree
		xd = (xdict_t) malloc(sizeof(xdict_st));
		memset(xd, 0, sizeof(xdict_st));
		xd->ref = 1;
		xd->xdict = (void *) xt;
		xd->xmode = SCWS_XDICT_MEM;
		return xd;
	}
}