예제 #1
0
/*
 * Segment `content` with the SCWS handle `s` and append one SCWSRESULT
 * per segmented word to `v`.
 *
 * content  text to segment; each word is copied out of it using the
 *          offset/length reported by SCWS.
 * v        output vector; results are appended, existing entries are kept.
 *
 * Each SCWSRESULT carries the word text and its IDF truncated to an int.
 */
void Scws::GetResult(string content
	,vector<SCWSRESULT> &v)
{
	scws_send_text(s, content.c_str(), content.length());

	// scws_get_result() hands back one linked list per call until the
	// text is exhausted; every list must be released with scws_free_result().
	scws_res_t res;
	while ((res = scws_get_result(s)) != NULL)
	{
		for (scws_res_t cur = res; cur != NULL; cur = cur->next)
		{
			SCWSRESULT result;
			result.word = string(content, cur->off, cur->len);
			result.weight = (int)cur->idf;
			// std::vector has no push(); push_back() is the append operation.
			v.push_back(result);
		}
		scws_free_result(res);
	}

	// NOTE(review): this releases the segmenter handle itself, so `s` is
	// unusable after the first call -- confirm this is intended rather than
	// belonging in the destructor.
	scws_free(s);
}
예제 #2
0
/*
 * Python entry point: segment the given string with the SCWS handle `s`.
 *
 * args  a single Python string.
 *
 * Returns a new list of (word, attr, idf) tuples, one per segmented word,
 * or NULL with a Python exception set if argument parsing or an allocation
 * fails.
 */
static PyObject * scws_get_res(PyObject * self,PyObject * args){
    const char *text;
    scws_res_t res, cur;
    PyObject *v;

    if (!PyArg_ParseTuple(args, "s",&text))
        return NULL;

    v = PyList_New(0);
    if (v == NULL)
        return NULL;

    scws_send_text(s, text, strlen(text));
    while ((res = scws_get_result(s)) != NULL)
    {
        for (cur = res; cur != NULL; cur = cur->next){
            PyObject *word = PyString_FromStringAndSize(text+cur->off,cur->len);
            PyObject *attr = PyString_FromString(cur->attr);
            PyObject *item = NULL;
            int failed;

            if (word != NULL && attr != NULL)
                item = Py_BuildValue("(O,O,d)", word, attr, cur->idf);
            /* "O" takes its own references, so ours are dropped here. */
            Py_XDECREF(word);
            Py_XDECREF(attr);

            /* PyList_Append also takes its own reference to the tuple. */
            failed = (item == NULL || PyList_Append(v, item) < 0);
            Py_XDECREF(item);

            if (failed){
                scws_free_result(res);
                Py_DECREF(v);
                return NULL;
            }
        }
        /* Release each result list inside the loop; once the loop ends
         * `res` is NULL and there is nothing left to free. */
        scws_free_result(res);
    }

    /* `v` is already a new reference owned by the caller. */
    return v;
}
예제 #3
0
/*
 * Hand the next lexeme of the current SCWS result list to the caller.
 *
 * Argument 0 is the ParserState, argument 1 receives a pointer to the
 * lexeme inside pst->buffer, argument 2 receives its length.
 *
 * Returns 0 (with *tlen set to 0) when no results remain; otherwise the
 * first character of the word's attribute as the lexeme type, with
 * anything outside 'a'..'x' mapped to 'x'.
 */
Datum
zhprs_getlexeme(PG_FUNCTION_ARGS)
{
	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);
	scws_res_t	node;
	int			lextype;

	/* current list consumed (or never fetched): pull the next one */
	if (pst->curr == NULL)
	{
		scws_res_t	fetched = scws_get_result(pst->scws);

		pst->curr = fetched;
		pst->res = fetched;
	}

	/* nothing left to hand out */
	if (pst->res == NULL)
	{
		*tlen = 0;
		PG_RETURN_INT32(0);
	}

	node = pst->curr;
	if (node == NULL)
		PG_RETURN_INT32(-1);

	/*
	 * The lexeme type is the first attribute character; values outside
	 * 'a'..'x' (for example the capitalised Ag, Dg, Ng, Tg, Vg attributes)
	 * are reported as the unknown type 'x'.
	 * Attribute reference: http://www.xunsearch.com/scws/docs.php#attr
	 */
	lextype = (int) node->attr[0];
	if (lextype < (int) 'a' || lextype > (int) 'x')
		lextype = (int) 'x';

	*tlen = node->len;
	*t = pst->buffer + node->off;

	/* step forward; free the list once its last word has been returned */
	pst->curr = node->next;
	if (pst->curr == NULL)
	{
		scws_free_result(pst->res);
		pst->res = NULL;
	}

	PG_RETURN_INT32(lextype);
}
예제 #4
0
/*
 * Segment the AS3 string `inputString` with the SCWS handle `s`, print each
 * word, and trace the words joined by single spaces (each word is followed
 * by one space) through the AS3 variable `myString`.
 *
 * The joined text is built in a fixed 5000-byte buffer; words that no
 * longer fit are still printed but are left out of the joined string.
 */
void scws_send_text_AS3()
{
	char *text = NULL;
	AS3_MallocString(text, inputString);
	scws_send_text(s, text, strlen(text));
	AS3_DeclareVar(myString, String);

	/* Start from an empty string (the buffer used to be seeded with "0"). */
	char result[5000] = {'\0'};
	size_t used = 0;	/* bytes of `result` in use, excluding the NUL */
	int full = 0;		/* set once a word no longer fits */

	while ((res = cur = scws_get_result(s)) != NULL)
	{
		for (; cur != NULL; cur = cur->next)
		{
			size_t wlen = (size_t)cur->len;

			printf("WORD: %.*s/%s (IDF = %4.2f)\n", cur->len, text+cur->off, cur->attr, cur->idf);

			if (full)
				continue;

			/* Need room for the word, the separating space and the NUL. */
			if (wlen + 2 > sizeof result - used)
			{
				full = 1;
				continue;
			}

			/* Copy straight from the source text; words there are not
			 * NUL-terminated, so the length is the only boundary. */
			memcpy(result + used, text + cur->off, wlen);
			used += wlen;
			result[used++] = ' ';
			result[used] = '\0';
		}
		scws_free_result(res);
	}

	printf("%s",result);
	AS3_CopyCStringToVar(myString, result, used);
	/* NOTE(review): this releases the segmenter handle, so a second call
	 * would use a freed `s` -- confirm that is intended. */
	scws_free(s);
	/* NOTE(review): `text` presumably comes from malloc via
	 * AS3_MallocString and is never released -- confirm and free it. */
	AS3_Trace(myString);

	AS3_Return("212");
}
예제 #5
0
파일: pyscws.c 프로젝트: MOON-CLJ/pyscws
/*
 * Segment a Python string with this object's SCWS handle.
 *
 * args  a single Python string.
 *
 * Returns a new list of two-element lists [word, attr], one per segmented
 * word, or NULL with a Python exception set if argument parsing or an
 * allocation fails.
 */
static PyObject* participle(Scws* self, PyObject* args){
    char *text;
    if(!PyArg_ParseTuple(args, "s", &text)){
        return NULL;
    }
    PyObject* result = PyList_New(0);
    if (result == NULL){
        return NULL;
    }
    scws_send_text(self->scws, text, strlen(text));
    scws_res_t res, cur;
    while ((res = scws_get_result(self->scws)) != NULL){
        for (cur = res; cur != NULL; cur = cur->next){
            PyObject* word_text = PyString_FromStringAndSize(text + cur->off, cur->len);
            PyObject* word_attr = PyString_FromString(cur->attr);
            PyObject* aword = PyList_New(2);
            if (word_text == NULL || word_attr == NULL || aword == NULL){
                Py_XDECREF(word_text);
                Py_XDECREF(word_attr);
                Py_XDECREF(aword);
                goto fail;
            }
            /* PyList_SetItem steals the item references. */
            PyList_SetItem(aword, 0, word_text);
            PyList_SetItem(aword, 1, word_attr);
            /* PyList_Append takes its own reference, so ours is dropped
             * whether or not the append succeeds. */
            if (PyList_Append(result, aword) < 0){
                Py_DECREF(aword);
                goto fail;
            }
            Py_DECREF(aword);
        }
        scws_free_result(res);
    }
    return result;

fail:
    /* Only reached from inside the loop, so `res` is the live list. */
    scws_free_result(res);
    Py_DECREF(result);
    return NULL;
}
예제 #6
0
/*
 * Demo routine: create an SCWS handle in the global `s`, configure it for
 * UTF-8 with the bundled dictionary and rule files, segment a fixed sample
 * sentence, print every word, then release the handle.
 *
 * If the handle cannot be created, an error is printed and the function
 * returns without touching the SCWS API further.
 */
void updateUniverse()
{
  const char *text = "Hello, 我名字叫李那曲是一个中国人, 我有时买Q币来玩, 我还听说过C#语言";

  if (!(s = scws_new())) {
    printf("ERROR: cann't init the scws!\n");
    /* Bail out: every call below would otherwise use a NULL handle. */
    return;
  }
  scws_set_charset(s, "utf8");
  scws_set_dict(s, "dict.utf8.xdb", SCWS_XDICT_XDB);
  scws_set_rule(s, "rules.utf8.ini");

  scws_send_text(s, text, strlen(text));
  /* Each call yields one linked list of words until the text is used up. */
  while ((res = cur = scws_get_result(s)) != NULL)
  {
    for (; cur != NULL; cur = cur->next)
    {
      printf("WORD: %.*s/%s (IDF = %4.2f)\n", cur->len, text+cur->off, cur->attr, cur->idf);
    }
    scws_free_result(res);
  }
  scws_free(s);
}
예제 #7
0
파일: scws.c 프로젝트: 9466/scws
/*
 * Scan the text held in `s` from s->off, cut off the next run of
 * characters, dispatch it to the matching segmenter, and return the
 * result list head (s->res0).
 *
 * Returns NULL when the remaining text holds nothing but bytes <= 0x20
 * (other than CR/LF) or when the scan makes no progress. s->off is
 * advanced past the consumed run.
 *
 * NOTE(review): SCWS_PUT_RES and the _scws_* helpers are defined outside
 * this view; presumably they append entries to s->res0/s->res1 -- confirm.
 */
scws_res_t scws_get_result(scws_t s)
{
	int off, len, ch, clen, zlen, pflag;
	unsigned char *txt;

	off = s->off;
	len = s->len;
	txt = s->txt;
	s->res0 = s->res1 = NULL;
	/* skip leading bytes <= 0x20; a CR or LF is emitted as its own
	 * one-byte result and returned immediately */
	while ((off < len) && (txt[off] <= 0x20))
	{
		if (txt[off] == 0x0a || txt[off] == 0x0d)
		{
			s->off = off + 1;
			SCWS_PUT_RES(off, 0.0, 1, attr_un);
			return s->res0;
		}
		off++;
	}

	/* nothing left after skipping */
	if (off >= len)
		return NULL;

	/* try to parse the sentence */
	s->off = off;
	ch = txt[off];
	/* a token character is returned as a one-byte result unless
	 * SCWS_IGN_SYMBOL is set in the mode */
	if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
	{
		s->off++;
		SCWS_PUT_RES(off, 0.0, 1, attr_un);
		return s->res0;
	}
	clen = SCWS_CHARLEN(ch);
	zlen = 1;
	/* classify the first character: multibyte, single-byte alnum, or neither */
	pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));
	/* extend the run one character at a time; zlen counts characters */
	while ((off = (off+clen)) < len)
	{
		ch = txt[off];
		/* a byte <= 0x20 or a token character ends the run */
		if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch)) break;		
		clen = SCWS_CHARLEN(ch);
		if (!(pflag & PFLAG_WITH_MB))
		{
			// pure single-byte -> multibyte (2bytes)
			if (clen == 1)
			{
				if (pflag & PFLAG_ALNUM)
				{
					/* remember whether two letters or two digits occur
					 * back to back; any non-alnum byte clears PFLAG_ALNUM */
					if (SCWS_IS_ALPHA(ch))
					{
						if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
							pflag |= PFLAG_LONGALPHA;
					}
					else if (SCWS_IS_DIGIT(ch))
					{
						if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
							pflag |= PFLAG_LONGDIGIT;
					}
					else
						pflag ^= PFLAG_ALNUM;
				}
			}
			else
			{
				/* a multibyte character may join a single-byte run only
				 * while the run is alnum and at most 2 characters long */
				if (!(pflag & PFLAG_ALNUM) || zlen > 2)
					break;

				pflag |= PFLAG_WITH_MB;
				/* zlen = 1; */
			}
		}
		else if ((pflag & PFLAG_WITH_MB) && clen == 1)
		{
			int i;

			// multibyte followed by single-byte. allowed: alpha+num + Chinese
			if (!SCWS_IS_ALNUM(ch))
				break;
			
			pflag &= ~PFLAG_VALID;
			// at most 2 alphanumeric characters are allowed between Chinese characters (more than 2 can stand as a word of their own, which is fine)
			/* NOTE(review): txt[i] is read before the i >= len test below,
			 * so this can read past len -- confirm the buffer allows it */
			for (i = off+1; i < (off+3); i++)
			{
				ch = txt[i];
				if ((i >= len) || (ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
				{
					pflag |= PFLAG_VALID;
					break;
				}

				if (!SCWS_IS_ALNUM(ch))
					break;
			}		
			
			if (!(pflag & PFLAG_VALID))
				break;

			/* swallow the extra alnum bytes that were just accepted */
			clen += (i - off - 1);
		}
		/* hightman.070813: add max zlen limit */
		if (++zlen >= SCWS_MAX_ZLEN)
		    break;
	}

	/* hightman.070624: handle the half-character problem */
	/* ch keeps the un-clamped end offset for the s->off update below;
	 * if the last character ran past len, step back over it */
	if ((ch = off) > len)	
		off -= clen;

	/* do the real segment */
	if (off <= s->off)
		return NULL;
	else if (pflag & PFLAG_WITH_MB)
		_scws_msegment(s, off, zlen);
	else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
		_scws_ssegment(s, off);
	else
	{
		/* from here on zlen is the byte length of the alnum run */
		zlen = off - s->off;
		if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
			_scws_alnum_multi(s, s->off, zlen);
		else
		{
			float idf;

			idf = SCWS_EN_IDF(zlen);
			SCWS_PUT_RES(s->off, idf, zlen, attr_en);
		
			/* hightman.090523: split mixed alphanumerics once more: pure digits, (>1 ? pure letters : digits+letters) */
			if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
				_scws_alnum_multi(s, s->off, zlen);
		}
	}

	/* return the result */
	/* when the scan overshot len, everything up to len counts as consumed */
	s->off = (ch > len ? len : off);
	/* nothing was produced for this run: move on to the next one */
	if (s->res0 == NULL)
		return scws_get_result(s);

	return s->res0;
}