Exemplo n.º 1
0
/*
 * Store a NUL-terminated `value` under a NUL-terminated `key`.
 *
 * Thin wrapper around xtree_nput() that measures both strings itself.
 * Nothing is done when `xt` or `key` is NULL. A NULL `value` is forwarded
 * as-is with a length of 0.
 */
void xtree_put(xtree_t xt, const char *value, const char *key)
{
	size_t value_len = 0;

	if (xt == NULL || key == NULL)
		return;

	if (value != NULL)
		value_len = strlen(value);

	xtree_nput(xt, (void *) value, value_len, key, strlen(key));
}
Exemplo n.º 2
0
/*
 * Build a rule set from the text file `fpath`.
 *
 * The file is read twice:
 *   1. a quick scan collects every `[name]` section header (except "attrs",
 *      empty names and names longer than 15 bytes) into r->items, stopping
 *      at SCWS_RULE_MAX entries;
 *   2. a full pass fills r->attr from the `[attrs]` section, applies the
 *      `:param = value` lines to the current section and stores the entries
 *      of each section in r->tree.
 *
 * fpath  path of the rule file.
 * mblen  table indexed by a byte value; it gives the number of bytes taken
 *        for each key when a section is not read line by line (`:line = no`).
 *
 * Returns the new rule set, or NULL when `fpath` is NULL, the file cannot be
 * opened, memory cannot be allocated, or the tree cannot be created.
 */
rule_t scws_rule_new(const char *fpath, unsigned char *mblen)
{
	FILE *fp;
	rule_t r;
	rule_item_t cr;
	int i, j, rbl, aflag;
	rule_attr_t a, rtail = NULL;
	unsigned char buf[512], *str, *ptr, *qtr;

	/* loaded or open file failed */
	if (fpath == NULL || (fp = fopen(fpath, "r")) == NULL)
		return NULL;

	/* alloc the memory */
	r = (rule_t) malloc(sizeof(rule_st));
	if (r == NULL)
	{
		fclose(fp);
		return NULL;
	}
	memset(r, 0, sizeof(rule_st));

	/* quick scan to add the name to list */
	i = j = rbl = aflag = 0;
	while (fgets(buf, sizeof(buf)-1, fp))
	{
		if (buf[0] != '[' || !(ptr = strchr(buf, ']')))
			continue;

		str = buf + 1;
		*ptr = '\0';
		if (ptr == str || (ptr-str) > 15 || !strcasecmp(str, "attrs"))
			continue;

		/* skip section names that were already registered */
		if (_rule_index_get(r, str) >= 0)
			continue;

		strcpy(r->items[i].name, str);
		r->items[i].tf = 5.0;
		r->items[i].idf = 3.5;
		strncpy(r->items[i].attr, "un", 2);
		if (!strcasecmp(str, "special"))
			r->items[i].bit = SCWS_RULE_SPECIAL;
		else if (!strcasecmp(str, "nostats"))
			r->items[i].bit = SCWS_RULE_NOSTATS;
		else
		{
			/* every ordinary section gets its own bit */
			r->items[i].bit = (1<<j);
			j++;
		}

		if (++i >= SCWS_RULE_MAX)
			break;
	}
	rewind(fp);

	/* load the tree data */
	if ((r->tree = xtree_new(0, 1)) == NULL)
	{
		/* do not leak the open file on failure */
		fclose(fp);
		free(r);
		return NULL;
	}
	cr = NULL;
	while (fgets(buf, sizeof(buf)-1, fp))
	{
		/* comment line */
		if (buf[0] == ';')
			continue;

		/* section header: select the current item or the attrs mode */
		if (buf[0] == '[')
		{
			cr = NULL;
			str = buf + 1;
			aflag = 0;
			if ((ptr = strchr(str, ']')) != NULL)
			{
				*ptr = '\0';
				if (!strcasecmp(str, "attrs"))
				{
					aflag = 1;
				}
				else if ((i = _rule_index_get(r, str)) >= 0)
				{
					rbl = 1;	/* default read by line = yes */
					cr = &r->items[i];
				}
			}
			continue;
		}

		/* attr flag open? */
		if (aflag == 1)
		{
			/* parse the attr line: attr1[(n)] + attr2[(n)] = ratio */
			str = buf;
			while (*str == ' ' || *str == '\t') str++;
			if ((ptr = strchr(str, '+')) == NULL) continue;
			*ptr++ = '\0';
			if ((qtr = strchr(ptr, '=')) == NULL) continue;
			*qtr++ = '\0';

			/* create new memory */
			a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr));
			if (a == NULL)
				continue;
			memset(a, 0, sizeof(struct scws_rule_attr));

			/* get ratio */
			while(*qtr == ' ' || *qtr == '\t') qtr++;
			a->ratio = (short) atoi(qtr);
			if (a->ratio < 1)
				a->ratio = 1;
			a->npath[0] = a->npath[1] = 0xff;

			/* read attr1 & npath1? (an empty name must not step past the NUL) */
			if (*str)
				a->attr1[0] = *str++;
			if (*str && *str != '(' && *str != ' ' && *str != '\t')
				a->attr1[1] = *str++;
			while (*str && *str != '(') str++;
			if (*str == '(')
			{
				str++;
				if ((qtr = strchr(str, ')')) != NULL)
				{
					*qtr = '\0';
					/* the file value is decremented by one; 0 maps to 0xff */
					a->npath[0] = (unsigned char) atoi(str);
					if (a->npath[0] > 0)
						a->npath[0]--;
					else
						a->npath[0] = 0xff;
				}
			}

			/* read attr2 & npath2? */
			str = ptr;
			while (*str == ' ' || *str == '\t') str++;
			if (*str)
				a->attr2[0] = *str++;
			if (*str && *str != '(' && *str != ' ' && *str != '\t')
				a->attr2[1] = *str++;
			while (*str && *str != '(') str++;
			if (*str == '(')
			{
				str++;
				if ((qtr = strchr(str, ')')) != NULL)
				{
					*qtr = '\0';
					a->npath[1] = (unsigned char) atoi(str);
					if (a->npath[1] > 0)
						a->npath[1]--;
					else
						a->npath[1] = 0xff;
				}
			}

			/* append to the chain list */
			if (r->attr == NULL)
				r->attr = rtail = a;
			else
			{
				rtail->next = a;
				rtail = a;
			}

			continue;
		}

		/* everything below belongs to a known section */
		if (cr == NULL)
			continue;

		/* param set: line|znum|include|exclude|type|tf|idf|attr */
		if (buf[0] == ':')
		{
			str = buf + 1;
			if (!(ptr = strchr(str, '=')))
				continue;
			while (*str == ' ' || *str == '\t') str++;

			/* split into name (ptr) and value (str), trimming blanks around '=' */
			qtr = ptr + 1;
			while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--;
			*ptr = '\0';
			ptr = str;
			str = qtr;
			while (*str == ' ' || *str == '\t') str++;

			if (!strcmp(ptr, "line"))
				rbl =  (*str == 'N' || *str == 'n') ? 0 : 1;
			else if (!strcmp(ptr, "tf"))
				cr->tf = (float) atof(str);
			else if (!strcmp(ptr, "idf"))
				cr->idf = (float) atof(str);
			else if (!strcmp(ptr, "attr"))
				strncpy(cr->attr, str, 2);
			else if (!strcmp(ptr, "znum"))
			{
				/* "min" or "min,max" */
				if ((ptr = strchr(str, ',')) != NULL)
				{
					*ptr++ = '\0';
					while (*ptr == ' ' || *ptr == '\t') ptr++;
					cr->zmax = atoi(ptr);
					cr->flag |= SCWS_ZRULE_RANGE;
				}
				cr->zmin = atoi(str);
			}
			else if (!strcmp(ptr, "type"))
			{
				if (!strncmp(str, "prefix", 6))
					cr->flag |= SCWS_ZRULE_PREFIX;
				else if (!strncmp(str, "suffix", 6))
					cr->flag |= SCWS_ZRULE_SUFFIX;
			}
			else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude"))
			{
				unsigned int *clude;

				if (!strcmp(ptr, "include"))
				{
					clude = &cr->inc;
					cr->flag |= SCWS_ZRULE_INCLUDE;
				}
				else
				{
					clude = &cr->exc;
					cr->flag |= SCWS_ZRULE_EXCLUDE;
				}

				/* comma separated section names, all but the last one */
				while ((ptr = strchr(str, ',')) != NULL)
				{
					while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--;
					*ptr = '\0';
					if ((i = _rule_index_get(r, str)) >= 0)
						*clude |= r->items[i].bit;

					str = ptr + 1;
					while (*str == ' ' || *str == '\t' || *str == ',') str++;
				}

				/* last (or only) name: a lookup result below 0 means "not found" */
				ptr = strlen(str) + str;
				while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
				*ptr = '\0';
				if (ptr > str && (i = _rule_index_get(r, str)) >= 0)
					*clude |= r->items[i].bit;
			}
			continue;
		}

		/* read the entries */
		str = buf;
		while (*str == ' ' || *str == '\t') str++;
		ptr = str + strlen(str);
		while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--;
		*ptr = '\0';

		/* empty line */
		if (ptr == str)
			continue;

		if (rbl)
			xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str);
		else
		{
			/* the line is a run of keys, each mblen[first byte] bytes long */
			while (str < ptr)
			{
				j = mblen[(*str)];
				/* a zero table entry would never advance str */
				if (j < 1)
					j = 1;

#ifdef DEBUG
				/* try to check repeat */
				{
					rule_item_t dup = (rule_item_t) xtree_nget(r->tree, str, j, NULL);

					if (dup != NULL)
						fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, dup->name, j, str);
				}
#endif

				xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j);
				str += j;
			}
		}
	}
	fclose(fp);

	/* optimize the tree */
	xtree_optimize(r->tree);
	return r;
}
Exemplo n.º 3
0
/*
 * Open a text dictionary.
 *
 * The resolved path of `fpath` is hashed with scws_crc32() to name a cache
 * file ("scws-<crc>.xdb") in the temporary directory. When that cache file
 * exists and is newer than the text file it is opened instead:
 *   - with SCWS_XDICT_MEM in `mode` it is converted to an in-memory xtree;
 *   - otherwise (or when the conversion fails) the xdb handle itself is used.
 * In every other case the text file is parsed into a new xtree, the tree is
 * written to the cache file and the in-memory tree is returned.
 *
 * Line format: <word>[\t<tf>[\t<idf>[\t<attr>]]]; lines starting with ';' or
 * '#' are skipped.
 *
 * fpath  path of the text dictionary.
 * mode   only the SCWS_XDICT_MEM bit is examined here.
 * ml     table indexed by a byte value, used to step through a word when its
 *        prefixes are flagged with SCWS_WORD_PART.
 *
 * Returns a new dictionary with ref == 1, or NULL on any failure.
 */
static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
{
	xdict_t xd;
	xtree_t xt;
	char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
	struct stat st1, st2;
	int cl, kl, step;
	FILE *fp;
	word_st word, *w;
	char *key, *part, *last, *delim = " \t\r\n";

	// check the input filepath
	_realpath(fpath, buf);
	if (stat(buf, &st1) < 0)
		return NULL;

	// check dest file & original file, compare their mtime
#ifdef WIN32
	{
		char *tmp_ptr;
		GetTempPath(sizeof(tmpfile) - 20, tmpfile);
		tmp_ptr = tmpfile + strlen(tmpfile);
		if (tmp_ptr[-1] == '\\') tmp_ptr--;
		sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
	}
#else
	sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
#endif
	if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
	{
		xdb_t x;
		if ((x = xdb_open(tmpfile, 'r')) != NULL)
		{
			xd = (xdict_t) malloc(sizeof(xdict_st));
			if (xd == NULL)
			{
				xdb_close(x);
				return NULL;
			}
			memset(xd, 0, sizeof(xdict_st));
			xd->ref = 1;

			if (mode & SCWS_XDICT_MEM)
			{
				/* convert the xdb(disk) -> xtree(memory) */
				if ((xt = xdb_to_xtree(x, NULL)) != NULL)
				{
					xdb_close(x);
					xd->xdict = (void *) xt;
					xd->xmode = SCWS_XDICT_MEM;
					return xd;
				}
			}
			/* not requested in memory, or conversion failed: keep the xdb */
			xd->xmode = SCWS_XDICT_XDB;
			xd->xdict = (void *) x;
			return xd;
		}
	}

	// re-build the xdb file from text file
	// Resources are acquired so that every failure path can release what
	// was obtained before it: file first, then the result struct, then the tree.
	if ((fp = fopen(buf, "r")) == NULL)
		return NULL;

	xd = (xdict_t) malloc(sizeof(xdict_st));
	xt = (xd != NULL) ? xtree_new(0, 0) : NULL;
	if (xt == NULL)
	{
		free(xd);
		fclose(fp);
		return NULL;
	}
	memset(xd, 0, sizeof(xdict_st));

	// parse every line (buf is reused as the line buffer from here on)
	word.attr[2] = '\0';
	while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
	{
		// <word>[\t<tf>[\t<idf>[\t<attr>]]]
		if (buf[0] == ';' || buf[0] == '#') continue;

		key = _strtok_r(buf, delim, &last);
		if (key == NULL) continue;
		kl = strlen(key);

		// init the word; the do/while(0) lets a missing field stop the parse
		// while keeping the defaults for the remaining fields
		do
		{
			word.tf = word.idf = 1.0;
			word.flag = SCWS_WORD_FULL;
			word.attr[0] = '@';
			word.attr[1] = '\0';

			if (!(part = _strtok_r(NULL, delim, &last))) break;
			word.tf = (float) atof(part);

			if (!(part = _strtok_r(NULL, delim, &last))) break;
			word.idf = (float) atof(part);

			if ((part = _strtok_r(NULL, delim, &last)))
			{
				word.attr[0] = part[0];
				if (part[1]) word.attr[1] = part[1];
			}
		}
		while (0);

		// save into xtree: insert a new record or update the existing one
		if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
		{
			w = (word_st *) pmalloc(xt->p, sizeof(word_st));
			memcpy(w, &word, sizeof(word));
			xtree_nput(xt, w, sizeof(word), key, kl);
		}
		else
		{
			w->tf = word.tf;
			w->idf = word.idf;
			w->flag |= word.flag;
			strcpy(w->attr, word.attr);
		}

		// parse the part: flag the proper prefixes of the key, starting with
		// the one that ends after the second ml[] step
		cl = ml[(unsigned char) (key[0])];
		while (cl < kl)		// never index key[] beyond its terminator
		{
			step = ml[(unsigned char) (key[cl])];
			if (step == 0) break;	// a zero entry would loop forever
			cl += step;
			if (cl >= kl) break;

			if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
				w->flag |= SCWS_WORD_PART;
			else
			{
				w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
				w->flag = SCWS_WORD_PART;
				xtree_nput(xt, w, sizeof(word), key, cl);
			}
		}
	}
	fclose(fp);

	// optimize the xtree & save to xdb
	xtree_optimize(xt);
	unlink(tmpfile);
	xtree_to_xdb(xt, tmpfile);
	// NOTE(review): the cache file is made world-writable at a predictable
	// path in the temporary directory; confirm this is intended.
	chmod(tmpfile, 0777);

	// return xtree
	xd->ref = 1;
	xd->xdict = (void *) xt;
	xd->xmode = SCWS_XDICT_MEM;
	return xd;
}
Exemplo n.º 4
0
/*
 * Apply one JSON value to a rule set.
 *
 * rules      rule set that owns the attr chain (rules->attr) and the entry
 *            tree (rules->tree).
 * rule       item the value belongs to; its name selects the handling.
 * rulevalue  JSON value; only arrays are handled, anything else is ignored.
 *
 * When the item is named "attrs", every array element is parsed as
 * "attr1[(n)] + attr2[(n)] = ratio" and appended to the end of rules->attr.
 * For any other item every non-empty array element (blanks trimmed) is
 * stored in rules->tree with `rule` as its value.
 *
 * The element strings are modified in place while they are parsed.
 * Elements without a string value are skipped.
 */
void scws_rule_json_set(rule_t rules, rule_item_t rule, cJSON *rulevalue)
{
        char *rulename, *valuestring, *ptr, *qtr;
        size_t valuelen, i;
        cJSON *item;
        rule_attr_t value, rtail;

        if (rules == NULL || rule == NULL || rulevalue == NULL) return;
        rulename = rule->name;

        // attrs
        if (!strcmp(rulename, "attrs")) {
                if (rulevalue->type != cJSON_Array
                        || (valuelen = cJSON_GetArraySize(rulevalue)) == 0)
                        return;

                /* locate the current end of the chain so new nodes are appended */
                rtail = rules->attr;
                while (rtail != NULL && rtail->next != NULL)
                        rtail = rtail->next;

                for (i = 0; i < valuelen; i++) {
                        item = cJSON_GetArrayItem(rulevalue, (int)i);
                        if (item == NULL || item->valuestring == NULL) continue;
                        valuestring = item->valuestring;

                        while (isspace((unsigned char)*valuestring)) valuestring++;
                        if ((ptr = strchr(valuestring, '+')) == NULL) continue;
                        *ptr++ = '\0';
                        /* '=' follows the second attr, so search after the '+' */
                        if ((qtr = strchr(ptr, '=')) == NULL) continue;
                        *qtr++ = '\0';

                        value = (rule_attr_t)calloc(1, sizeof(struct scws_rule_attr));
                        if (value == NULL) return;

                        /* get ratio */
                        while (isspace((unsigned char)*qtr)) qtr++;
                        value->ratio = (short)atoi(qtr);
                        if (value->ratio < 1)
                                value->ratio = 1;
                        value->npath[0] = value->npath[1] = 0xff;

                        /* read attr1 & npath1 (an empty name must not step past the NUL) */
                        if (*valuestring)
                                value->attr1[0] = *valuestring++;
                        if (*valuestring && *valuestring != '(' && !isspace((unsigned char)*valuestring))
                                value->attr1[1] = *valuestring++;
                        while (*valuestring && *valuestring != '(') valuestring++;
                        if (*valuestring == '(') {
                                valuestring++;
                                if ((qtr = strchr(valuestring, ')')) != NULL) {
                                        *qtr = '\0';
                                        /* the given value is decremented by one; 0 maps to 0xff */
                                        value->npath[0] = (unsigned char)atoi(valuestring);
                                        if (value->npath[0] > 0)
                                                value->npath[0]--;
                                        else
                                                value->npath[0] = 0xff;
                                }
                        }

                        /* read attr2 & npath2 */
                        valuestring = ptr;
                        while (isspace((unsigned char)*valuestring)) valuestring++;
                        if (*valuestring)
                                value->attr2[0] = *valuestring++;
                        if (*valuestring && *valuestring != '(' && !isspace((unsigned char)*valuestring))
                                value->attr2[1] = *valuestring++;
                        while (*valuestring && *valuestring != '(') valuestring++;
                        if (*valuestring == '(') {
                                valuestring++;
                                if ((qtr = strchr(valuestring, ')')) != NULL) {
                                        *qtr = '\0';
                                        value->npath[1] = (unsigned char)atoi(valuestring);
                                        if (value->npath[1] > 0)
                                                value->npath[1]--;
                                        else
                                                value->npath[1] = 0xff;
                                }
                        }

                        /* append to the chain list */
                        if (rtail == NULL)
                                rules->attr = value;
                        else
                                rtail->next = value;
                        rtail = value;
                }
                return;
        } // if rulename == attrs

        // valuestring except attrs
        // array value
        else if (rulevalue->type == cJSON_Array) {
                if ((valuelen = cJSON_GetArraySize(rulevalue)) < 1) return;
                for (i = 0; i < valuelen; i++) {
                        item = cJSON_GetArrayItem(rulevalue, (int)i);
                        if (item == NULL || item->valuestring == NULL) continue;
                        valuestring = item->valuestring;

                        /* trim blanks on both sides */
                        while (isspace((unsigned char)*valuestring)) valuestring++;
                        ptr = valuestring + strlen(valuestring);
                        while (ptr > valuestring && strchr(" \t\r\n", ptr[-1])) ptr--;
                        *ptr = '\0';

                        if (ptr == valuestring) continue;
                        xtree_nput(rules->tree, rule, sizeof(struct scws_rule_item), valuestring, ptr - valuestring);
                }
        }
        // other types?
}