Esempio n. 1
0
/*
 * Recursively collect the records of the tree rooted at `ptr` into `nodes`.
 *
 * For every visited record with a non-zero length the function reads the
 * record head through _xdb_read_data(), appends one entry to `nodes`
 * (record offset, record length and a _mem_ndup() copy of the key),
 * increments *count, and then descends into the two child pointers stored
 * at the front of the record (first child, then second child).
 *
 * Record head layout as used by this code: two xptr_st child pointers at
 * byte 0 and byte sizeof(xptr_st), a one-byte key length at byte 16 and the
 * key bytes starting at byte 17.
 * NOTE(review): the literal offsets 16/17 assume sizeof(xptr_st) == 8 -- confirm.
 *
 * x      database handle, only passed through to _xdb_read_data().
 * ptr    record to visit. It doubles as scratch storage: it is overwritten
 *        with the child pointers while descending, so its contents are
 *        unspecified for the caller afterwards.
 * nodes  output array. Its capacity is not known here; the caller must
 *        provide room for every record in the tree.
 * count  in/out: index of the next free slot in `nodes`.
 *
 * Records too short to contain a complete head (fewer than 17 bytes) are
 * treated as corrupt and skipped, together with whatever they would have
 * pointed to, instead of being parsed from indeterminate stack bytes.
 */
static void _xdb_load_nodes(xdb_t x, xptr_t ptr, xcmper_st *nodes, int *count)
{
	enum { KLEN_OFF = 16, KEY_OFF = 17 };
	/* Zero-filled so that a short or failed read yields empty child
	 * pointers (len == 0 stops the recursion) rather than stack garbage. */
	unsigned char buf[XDB_MAXKLEN + 18] = {0};
	int nread, klen, slot;

	if (ptr->len == 0)
		return;

	/* Read at most sizeof(buf)-1 bytes; compare without a signed cast so a
	 * huge length cannot turn into a negative read size. */
	nread = (int)sizeof(buf) - 1;
	if ((unsigned long)ptr->len < (unsigned long)nread)
		nread = (int)ptr->len;

	/* A record shorter than its fixed head cannot be parsed. */
	if (nread < KEY_OFF)
		return;

	_xdb_read_data(x, buf, ptr->off, nread);

	/* Never trust the stored key length beyond the bytes actually read. */
	klen = buf[KLEN_OFF];
	if (klen > nread - KEY_OFF)
		klen = nread - KEY_OFF;

	slot = *count;
	nodes[slot].ptr.off = ptr->off;
	nodes[slot].ptr.len = ptr->len;
	nodes[slot].key = (char *) _mem_ndup(buf + KEY_OFF, klen);
	*count = slot + 1;

	/* left & right: buf is local to this call, so the second child pointer
	 * is still intact after the first recursion has clobbered *ptr. */
	memcpy(ptr, buf, sizeof(xptr_st));
	_xdb_load_nodes(x, ptr, nodes, count);

	memcpy(ptr, buf + sizeof(xptr_st), sizeof(xptr_st));
	_xdb_load_nodes(x, ptr, nodes, count);
}
Esempio n. 2
0
File: scws.c Progetto: 9466/scws
/*
 * Split the text range [s->off, end) of s->txt into alphanumeric tokens and
 * single-character symbols, emitting each piece through SCWS_PUT_RES.
 *
 * Processing order:
 *   1. If the range is longer than one byte, an upper-cased copy is tested
 *      with SCWS_IS_SPECIAL; on a match the whole range is emitted as one
 *      "nz" item (weight 9.5) and the function returns.
 *   2. If the range looks like dotted initials (letter '.' letter '.' ...),
 *      the whole range is emitted as one "nz" item (weight 7.5).
 *   3. Otherwise the range is scanned left to right: runs starting with a
 *      letter or digit become tokens tagged attr_en, any other byte becomes
 *      a one-byte item tagged attr_un unless SCWS_IGN_SYMBOL is set in
 *      s->mode.
 *
 * s    segmenter state; this function reads s->off, s->txt and s->mode.
 * end  index one past the last byte of the range to process.
 *
 * NOTE(review): SCWS_PUT_RES, SCWS_IS_*, SCWS_EN_IDF, attr_en and attr_un
 * are defined outside this block; presumably SCWS_PUT_RES(off, idf, len,
 * attr) records one result item -- confirm against its definition.
 * NOTE(review): several tests read txt[start+1], which can be index `end`;
 * this presumably relies on the buffer being readable there -- confirm.
 */
static void _scws_ssegment(scws_t s, int end)
{
	int start, wlen, ch, pflag, ipflag = 0;
	unsigned char *txt;
	float idf;

	start = s->off;
	wlen = end - start;

	/* check special words (need strtoupper) */
	if (wlen > 1)
	{	
		/* work on a private copy so the upper-casing does not touch s->txt */
		/* NOTE(review): the result is cast to char * but stored in an unsigned char * */
		txt = (char *) _mem_ndup(s->txt + start, wlen);	
		_str_toupper(txt, txt);
		if (SCWS_IS_SPECIAL(txt, wlen))
		{
			SCWS_PUT_RES(start, 9.5, wlen, "nz");
			free(txt);
			return;
		}
		free(txt);
	}

	txt = s->txt;	
	/* check brief words such as S.H.E M.R. */	
	if (SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.')
	{
		/* ch is used as an index here: walk letter / '.' pairs after the first pair */
		for (ch = start + 2; ch < end; ch++)
		{
			if (!SCWS_IS_ALPHA(txt[ch])) break;
			ch++;
			/* a trailing letter without a final '.' is accepted (ch == end) */
			if (ch == end || txt[ch] != '.') break;
		}
		/* the pattern consumed the whole range: emit it as one item */
		if (ch == end)
		{
			SCWS_PUT_RES(start, 7.5, wlen, "nz");
			return;
		}
	}

	/* Extract words and punctuation. A number may contain one dot provided the next character is a digit (not repeated). A word may contain one non-repeated apostrophe ('). */
	while (start < end)
	{
		ch = txt[start++];
		/* leave IP/version mode at the first byte that is neither '.' (0x2e) nor a digit */
		if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
			ipflag = 0;
		if (SCWS_IS_ALNUM(ch))
		{
			/* PFLAG_DIGIT: token is currently numeric; PFLAG_ADDSYM (set later): an extra symbol has been accepted */
			pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
			wlen = 1;
			/* extend the token; start always indexes the next unconsumed byte */
			while (start < end)
			{
				ch = txt[start];
				if (pflag & PFLAG_DIGIT)
				{
					if (!SCWS_IS_DIGIT(ch))
					{
						// check percent % = 0x25
						// a '%' not followed by a digit is absorbed into the number and ends the token
						if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1]))
						{
							start++;
							wlen++;
							break;
						}
						// in IP/version mode each digit group is its own token
						if (ipflag)
							break;
						// special for IP address or version number? (find out all digit + dot)
						// a second '.' after one was already accepted: enter IP mode and
						// rewind start/wlen to the previous '.', so the token emitted below stops before it
						if (ch == 0x2e && (pflag & PFLAG_ADDSYM))
						{
							ipflag = 1;
							while(--wlen && txt[--start] != 0x2e);
							pflag = 0;
							break;
						}
						// wlen = 1
						// one digit directly followed by a letter: switch to letter mode
						// (PFLAG_DIGIT is cleared) and mark ADDSYM; `continue` skips the
						// advance below so the same byte is re-examined by the letter branch
						if (wlen == 1 && SCWS_IS_ALPHA(ch))
						{
							pflag ^= PFLAG_DIGIT;
							pflag |= PFLAG_ADDSYM;
							continue;
						}
						// strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
						// stop unless this is the first extra symbol and it is a '.' followed by a digit
						if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
							break;
						pflag |= PFLAG_ADDSYM;
					}
				}
				else
				{
					/* hightman.110419: '-' (0x2d) between letters is allowed as a joiner, '_' (0x5f) is allowed as a joiner */
					if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
						pflag |= PFLAG_ADDSYM;
					else if (!SCWS_IS_ALPHA(ch))
					{
						/* a non-letter is accepted only once, and only if it is an apostrophe (0x27)
						 * followed by a letter, or a digit that is not followed by another digit */
						if ((pflag & PFLAG_ADDSYM)
							|| !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1]))
								|| (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1]))))
						{
							break;
						}
						pflag |= PFLAG_ADDSYM;
					}
				}
				start++;
				wlen++;
				/* cap the token length at SCWS_MAX_EWLEN */
				if (wlen >= SCWS_MAX_EWLEN)
					break;
			}
			/* emit the token that ends just before start */
			idf = SCWS_EN_IDF(wlen);
			SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
			/* tokens that absorbed an extra symbol are also handed to _scws_alnum_multi when SCWS_MULTI_DUALITY is set */
			if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
				_scws_alnum_multi(s, start-wlen, wlen);
		}
		else if (!(s->mode & SCWS_IGN_SYMBOL))
		{
			/* any other byte is emitted on its own unless symbols are being ignored */
			SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
		}
	}
}