/**
 * Segment `content` and append one SCWSRESULT per produced word to `v`.
 *
 * Each entry's `word` is the substring of `content` described by the
 * result node's offset/length, and `weight` is the node's idf truncated
 * to an int.
 *
 * @param content  text to segment (passed by value)
 * @param v        output vector; results are appended, existing entries kept
 */
void Scws::GetResult(string content, vector<SCWSRESULT> &v)
{
    scws_res_t res, cur;

    scws_send_text(s, content.c_str(), content.length());

    /* scws_get_result() hands back one linked list per call until it
     * returns NULL; each list must be released with scws_free_result(). */
    while ((res = cur = scws_get_result(s))) {
        for (; cur != NULL; cur = cur->next) {
            SCWSRESULT result;
            result.word = string(content, cur->off, cur->len);
            result.weight = (int)cur->idf;
            v.push_back(result);    /* std::vector has no push() member */
        }
        scws_free_result(res);
    }

    /* NOTE(review): this releases the segmenter handle after a single call,
     * as the original code did. If `s` is meant to outlive this call (or is
     * also freed elsewhere, e.g. in a destructor), this needs revisiting. */
    scws_free(s);
}
/*
 * Python entry point: segment the string argument with the module-level
 * segmenter `s` and return a list of (word, attr, idf) tuples.
 *
 * Returns a new reference to the list, or NULL with a Python exception set
 * when argument parsing or an allocation fails.
 */
static PyObject *
scws_get_res(PyObject *self, PyObject *args)
{
    const char *text;
    scws_res_t res, cur;
    PyObject *v;

    if (!PyArg_ParseTuple(args, "s", &text))
        return NULL;

    v = PyList_New(0);
    if (v == NULL)
        return NULL;

    scws_send_text(s, text, strlen(text));

    while ((res = scws_get_result(s)) != NULL) {
        for (cur = res; cur != NULL; cur = cur->next) {
            /* "N" hands ownership of the freshly created strings to the
             * tuple (the original "O" left one reference to each dangling);
             * a NULL argument makes Py_BuildValue fail and return NULL. */
            PyObject *item = Py_BuildValue("(N,N,d)",
                PyString_FromStringAndSize(text + cur->off, cur->len),
                PyString_FromString(cur->attr),
                cur->idf);

            if (item == NULL || PyList_Append(v, item) < 0) {
                Py_XDECREF(item);
                scws_free_result(res);
                Py_DECREF(v);
                return NULL;
            }
            /* PyList_Append took its own reference; drop ours. */
            Py_DECREF(item);
        }
        /* Release every batch inside the loop: once the loop ends, res is
         * NULL and freeing it there (as before) released nothing. */
        scws_free_result(res);
    }

    /* Return the list itself rather than Py_BuildValue("O", v), which
     * added a second reference that was never released. */
    return v;
}
Datum zhprs_getlexeme(PG_FUNCTION_ARGS) { ParserState *pst = (ParserState *) PG_GETARG_POINTER(0); char **t = (char **) PG_GETARG_POINTER(1); int *tlen = (int *) PG_GETARG_POINTER(2); int type = -1; if (pst->curr == NULL) pst->res = pst->curr = scws_get_result(pst->scws); /* already done the work, or no sentence */ if (pst->res == NULL) { *tlen = 0; type = 0; } /* have results */ else if (pst->curr != NULL) { scws_res_t curr = pst->curr; /* * check the first char to determine the lextype * if out of [0,25],then set to 'x',mean unknown type * so for Ag,Dg,Ng,Tg,Vg,the type will be unknown * for full attr explanation,visit http://www.xunsearch.com/scws/docs.php#attr */ type = (int)(curr->attr)[0]; if (type > (int)'x' || type < (int)'a') type = (int)'x'; *tlen = curr->len; *t = pst->buffer + curr->off; pst->curr = curr->next; /* clear for the next calling */ if (pst->curr == NULL) { scws_free_result(pst->res); pst->res = NULL; } } PG_RETURN_INT32(type); }
void scws_send_text_AS3() { char *text = NULL; AS3_MallocString(text, inputString); scws_send_text(s, text, strlen(text)); AS3_DeclareVar(myString, String); //char *result; //result[0] = '\0'; // ensures the memory is an empty string char result[5000]={"0"}; char temp[1000]={'\0'}; printf("%s",result); while (res = cur = scws_get_result(s)) { while (cur != NULL) { printf("WORD: %.*s/%s (IDF = %4.2f)\n", cur->len, text+cur->off, cur->attr, cur->idf); //if((result = malloc(strlen(result)+ cur->len +1)) != NULL){ //if((result = (char*) realloc(strlen(result)+ (cur->len) +1)) != NULL) strncpy(temp, text+cur->off, cur->len); temp[(cur->len)+1]='\0'; strcat(result, temp); strcat(result, ' '); strcat(result, '\0'); //strncpy(new_str,str2); //} else { //printf("malloc failed!\n"); // exit? //} cur = cur->next; } scws_free_result(res); } strcat(result, '\0'); printf("%s",result); AS3_CopyCStringToVar(myString, result, strlen(result)); scws_free(s); //scws_free(result); AS3_Trace(myString); AS3_Return("212"); }
/*
 * Scws.participle(text): segment `text` with this object's segmenter and
 * return a list of two-element lists [word, attr].
 *
 * Returns a new reference to the list, or NULL with a Python exception set
 * when argument parsing or an allocation fails.
 */
static PyObject *
participle(Scws *self, PyObject *args)
{
    char *text;
    PyObject *result;
    scws_res_t res, cur;

    if (!PyArg_ParseTuple(args, "s", &text)) {
        return NULL;
    }

    result = PyList_New(0);
    if (result == NULL) {
        return NULL;
    }

    scws_send_text(self->scws, text, strlen(text));

    while ((res = scws_get_result(self->scws)) != NULL) {
        for (cur = res; cur != NULL; cur = cur->next) {
            PyObject *word_text = PyString_FromStringAndSize(text + cur->off, cur->len);
            PyObject *word_attr = PyString_FromString(cur->attr);
            PyObject *aword = PyList_New(2);

            if (word_text == NULL || word_attr == NULL || aword == NULL) {
                Py_XDECREF(word_text);
                Py_XDECREF(word_attr);
                Py_XDECREF(aword);
                goto fail;
            }

            /* PyList_SetItem steals the item references. */
            PyList_SetItem(aword, 0, word_text);
            PyList_SetItem(aword, 1, word_attr);

            if (PyList_Append(result, aword) < 0) {
                Py_DECREF(aword);
                goto fail;
            }
            /* PyList_Append took its own reference; drop ours. */
            Py_DECREF(aword);
        }
        scws_free_result(res);
    }
    return result;

fail:
    /* Release the batch being walked and the partially built list. */
    scws_free_result(res);
    Py_DECREF(result);
    return NULL;
}
void updateUniverse() { char *text = "Hello, 我名字叫李那曲是一个中国人, 我有时买Q币来玩, 我还听说过C#语言"; if (!(s = scws_new())) { printf("ERROR: cann't init the scws!\n"); //exit(-1); } scws_set_charset(s, "utf8"); scws_set_dict(s, "dict.utf8.xdb", SCWS_XDICT_XDB); scws_set_rule(s, "rules.utf8.ini"); scws_send_text(s, text, strlen(text)); while (res = cur = scws_get_result(s)) { while (cur != NULL) { printf("WORD: %.*s/%s (IDF = %4.2f)\n", cur->len, text+cur->off, cur->attr, cur->idf); cur = cur->next; } scws_free_result(res); } scws_free(s); }
/*
 * Produce the next batch of segmentation results for the text attached
 * to `s`, scanning from s->off up to s->len.
 *
 * Leading bytes <= 0x20 are skipped, except CR/LF, which are emitted as a
 * single one-byte result. A token character (when SCWS_IGN_SYMBOL is not
 * set in s->mode) is likewise emitted as a one-byte result. Otherwise a run
 * of characters is collected and handed to one of the segment helpers.
 *
 * Returns s->res0 (head of the result list built for this batch), or NULL
 * when the input is exhausted or no progress can be made. s->off is advanced
 * past the consumed bytes.
 */
scws_res_t scws_get_result(scws_t s)
{
	int off, len, ch, clen, zlen, pflag;
	unsigned char *txt;

	off = s->off;
	len = s->len;
	txt = s->txt;
	s->res0 = s->res1 = NULL;

	/* skip control/space bytes; CR and LF become their own result */
	while ((off < len) && (txt[off] <= 0x20))
	{
		if (txt[off] == 0x0a || txt[off] == 0x0d)
		{
			s->off = off + 1;
			SCWS_PUT_RES(off, 0.0, 1, attr_un);
			return s->res0;
		}
		off++;
	}

	if (off >= len)
		return NULL;

	/* try to parse the sentence */
	s->off = off;
	ch = txt[off];
	if (SCWS_CHAR_TOKEN(ch) && !(s->mode & SCWS_IGN_SYMBOL))
	{
		s->off++;
		SCWS_PUT_RES(off, 0.0, 1, attr_un);
		return s->res0;
	}

	clen = SCWS_CHARLEN(ch);
	zlen = 1;
	pflag = (clen > 1 ? PFLAG_WITH_MB : (SCWS_IS_ALNUM(ch) ? PFLAG_ALNUM : 0));

	/* extend the run one character (clen bytes) at a time */
	while ((off = (off+clen)) < len)
	{
		ch = txt[off];
		if (ch <= 0x20 || SCWS_CHAR_TOKEN(ch))
			break;

		clen = SCWS_CHARLEN(ch);
		if (!(pflag & PFLAG_WITH_MB))
		{
			/* pure single-byte -> multibyte (2 bytes) */
			if (clen == 1)
			{
				if (pflag & PFLAG_ALNUM)
				{
					if (SCWS_IS_ALPHA(ch))
					{
						if (!(pflag & PFLAG_LONGALPHA) && SCWS_IS_ALPHA(txt[off-1]))
							pflag |= PFLAG_LONGALPHA;
					}
					else if (SCWS_IS_DIGIT(ch))
					{
						if (!(pflag & PFLAG_LONGDIGIT) && SCWS_IS_DIGIT(txt[off-1]))
							pflag |= PFLAG_LONGDIGIT;
					}
					else
						pflag ^= PFLAG_ALNUM;
				}
			}
			else
			{
				if (!(pflag & PFLAG_ALNUM) || zlen > 2)
					break;

				pflag |= PFLAG_WITH_MB;
				/* zlen = 1; */
			}
		}
		else if ((pflag & PFLAG_WITH_MB) && clen == 1)
		{
			int i;

			/* multibyte + single-byte. allowed: alpha/num + Chinese */
			if (!SCWS_IS_ALNUM(ch))
				break;

			pflag &= ~PFLAG_VALID;
			/*
			 * English letters/digits sandwiched between Chinese characters:
			 * at most 2 characters are allowed (more than 2 can stand as an
			 * independent word, which is fine).
			 */
			for (i = off+1; i < (off+3); i++)
			{
				/* Test the bound before reading: the original read txt[i]
				 * first, touching one byte past the end of the text. */
				if (i >= len)
				{
					pflag |= PFLAG_VALID;
					break;
				}
				ch = txt[i];
				if ((ch <= 0x20) || (SCWS_CHARLEN(ch) > 1))
				{
					pflag |= PFLAG_VALID;
					break;
				}
				if (!SCWS_IS_ALNUM(ch))
					break;
			}

			if (!(pflag & PFLAG_VALID))
				break;

			clen += (i - off - 1);
		}

		/* hightman.070813: add max zlen limit */
		if (++zlen >= SCWS_MAX_ZLEN)
			break;
	}

	/*
	 * hightman.070624: handle the "half character" problem (the last step
	 * ran past len). ch keeps the un-adjusted offset for the final s->off
	 * computation below.
	 */
	if ((ch = off) > len)
		off -= clen;

	/* do the real segment */
	if (off <= s->off)
		return NULL;
	else if (pflag & PFLAG_WITH_MB)
		_scws_msegment(s, off, zlen);
	else if (!(pflag & PFLAG_ALNUM) || ((off - s->off) >= SCWS_MAX_EWLEN))
		_scws_ssegment(s, off);
	else
	{
		zlen = off - s->off;
		if ((pflag & (PFLAG_LONGALPHA|PFLAG_LONGDIGIT)) == (PFLAG_LONGALPHA|PFLAG_LONGDIGIT))
			_scws_alnum_multi(s, s->off, zlen);
		else
		{
			float idf;

			idf = SCWS_EN_IDF(zlen);
			SCWS_PUT_RES(s->off, idf, zlen, attr_en);

			/*
			 * hightman.090523: split mixed letter/digit tokens once more:
			 * pure digits, (>1 ? pure letters : digits+letters)
			 */
			if ((s->mode & SCWS_MULTI_DUALITY) && zlen > 2)
				_scws_alnum_multi(s, s->off, zlen);
		}
	}

	/* return the result */
	s->off = (ch > len ? len : off);
	/* nothing was emitted for this run: continue with the next one */
	if (s->res0 == NULL)
		return scws_get_result(s);

	return s->res0;
}