/*
 * Store a NUL-terminated `value` under a NUL-terminated `key`.
 *
 * Convenience front end for xtree_nput(): both lengths are measured with
 * strlen().  A NULL `value` is passed through with length 0.  The call is
 * a no-op when either the tree or the key is NULL.
 */
void xtree_put(xtree_t xt, const char *value, const char *key)
{
	size_t value_len = 0;
	size_t key_len;

	/* nothing can be stored without a tree and a key */
	if (xt == NULL || key == NULL)
		return;

	if (value != NULL)
		value_len = strlen(value);
	key_len = strlen(key);

	xtree_nput(xt, (void *) value, value_len, key, key_len);
}
rule_t scws_rule_new(const char *fpath, unsigned char *mblen) { FILE *fp; rule_t r; rule_item_t cr; int i, j, rbl, aflag; rule_attr_t a,rtail; unsigned char buf[512], *str, *ptr, *qtr; /* loaded or open file failed */ if ((fp = fopen(fpath, "r")) == NULL) return NULL; /* alloc the memory */ r = (rule_t) malloc(sizeof(rule_st)); memset(r, 0, sizeof(rule_st)); /* quick scan to add the name to list */ i = j = rbl = aflag = 0; while (fgets(buf, sizeof(buf)-1, fp)) { if (buf[0] != '[' || !(ptr = strchr(buf, ']'))) continue; str = buf + 1; *ptr = '\0'; if (ptr == str || (ptr-str) > 15 || !strcasecmp(str, "attrs")) continue; if (_rule_index_get(r, str) >= 0) continue; strcpy(r->items[i].name, str); r->items[i].tf = 5.0; r->items[i].idf = 3.5; strncpy(r->items[i].attr, "un", 2); if (!strcasecmp(str, "special")) r->items[i].bit = SCWS_RULE_SPECIAL; else if (!strcasecmp(str, "nostats")) r->items[i].bit = SCWS_RULE_NOSTATS; else { r->items[i].bit = (1<<j); j++; } if (++i >= SCWS_RULE_MAX) break; } rewind(fp); /* load the tree data */ if ((r->tree = xtree_new(0, 1)) == NULL) { free(r); return NULL; } cr = NULL; while (fgets(buf, sizeof(buf)-1, fp)) { if (buf[0] == ';') continue; if (buf[0] == '[') { cr = NULL; str = buf + 1; aflag = 0; if ((ptr = strchr(str, ']')) != NULL) { *ptr = '\0'; if (!strcasecmp(str, "attrs")) { aflag = 1; } else if ((i = _rule_index_get(r, str)) >= 0) { rbl = 1; /* default read by line = yes */ cr = &r->items[i]; } } continue; } /* attr flag open? 
*/ if (aflag == 1) { /* parse the attr line */ str = buf; while (*str == ' ' || *str == '\t') str++; if ((ptr = strchr(str, '+')) == NULL) continue; *ptr++ = '\0'; if ((qtr = strchr(ptr, '=')) == NULL) continue; *qtr++ = '\0'; /* create new memory */ a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr)); memset(a, 0, sizeof(struct scws_rule_attr)); /* get ratio */ while(*qtr == ' ' || *qtr == '\t') qtr++; a->ratio = (short) atoi(qtr); if (a->ratio < 1) a->ratio = 1; a->npath[0] = a->npath[1] = 0xff; /* read attr1 & npath1? */ a->attr1[0] = *str++; if (*str && *str != '(' && *str != ' ' && *str != '\t') a->attr1[1] = *str++; while (*str && *str != '(') str++; if (*str == '(') { str++; if ((qtr = strchr(str, ')')) != NULL) { *qtr = '\0'; a->npath[0] = (unsigned char) atoi(str); if (a->npath[0] > 0) a->npath[0]--; else a->npath[0] = 0xff; } } /* read attr1 & npath2? */ str = ptr; while (*str == ' ' || *str == '\t') str++; a->attr2[0] = *str++; if (*str && *str != '(' && *str != ' ' && *str != '\t') a->attr2[1] = *str++; while (*str && *str != '(') str++; if (*str == '(') { str++; if ((qtr = strchr(str, ')')) != NULL) { *qtr = '\0'; a->npath[1] = (unsigned char) atoi(str); if (a->npath[1] > 0) a->npath[1]--; else a->npath[1] = 0xff; } } //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0], // a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio); /* append to the chain list */ if (r->attr == NULL) r->attr = rtail = a; else { rtail->next = a; rtail = a; } continue; } if (cr == NULL) continue; /* param set: line|znum|include|exclude|type|tf|idf|attr */ if (buf[0] == ':') { str = buf + 1; if (!(ptr = strchr(str, '='))) continue; while (*str == ' ' || *str == '\t') str++; qtr = ptr + 1; while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--; *ptr = '\0'; ptr = str; str = qtr; while (*str == ' ' || *str == '\t') str++; if (!strcmp(ptr, "line")) rbl = (*str == 'N' || *str == 'n') ? 
0 : 1; else if (!strcmp(ptr, "tf")) cr->tf = (float) atof(str); else if (!strcmp(ptr, "idf")) cr->idf = (float) atof(str); else if (!strcmp(ptr, "attr")) strncpy(cr->attr, str, 2); else if (!strcmp(ptr, "znum")) { if ((ptr = strchr(str, ',')) != NULL) { *ptr++ = '\0'; while (*ptr == ' ' || *ptr == '\t') ptr++; cr->zmax = atoi(ptr); cr->flag |= SCWS_ZRULE_RANGE; } cr->zmin = atoi(str); } else if (!strcmp(ptr, "type")) { if (!strncmp(str, "prefix", 6)) cr->flag |= SCWS_ZRULE_PREFIX; else if (!strncmp(str, "suffix", 6)) cr->flag |= SCWS_ZRULE_SUFFIX; } else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude")) { unsigned int *clude; if (!strcmp(ptr, "include")) { clude = &cr->inc; cr->flag |= SCWS_ZRULE_INCLUDE; } else { clude = &cr->exc; cr->flag |= SCWS_ZRULE_EXCLUDE; } while ((ptr = strchr(str, ',')) != NULL) { while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--; *ptr = '\0'; if ((i = _rule_index_get(r, str)) >= 0) *clude |= r->items[i].bit; str = ptr + 1; while (*str == ' ' || *str == '\t' || *str == ',') str++; } ptr = strlen(str) + str; while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--; *ptr = '\0'; if (ptr > str && (i = _rule_index_get(r, str))) *clude |= r->items[i].bit; } continue; } /* read the entries */ str = buf; while (*str == ' ' || *str == '\t') str++; ptr = str + strlen(str); while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--; *ptr = '\0'; /* emptry line */ if (ptr == str) continue; if (rbl) xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str); else { while (str < ptr) { j = mblen[(*str)]; #ifdef DEBUG /* try to check repeat */ if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0) fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str); #endif xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j); str += j; } } } fclose(fp); /* optimize the tree */ xtree_optimize(r->tree); return r; }
/* open the text dict */ static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml) { xdict_t xd; xtree_t xt; char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX]; struct stat st1, st2; // check the input filepath _realpath(fpath, buf); if (stat(buf, &st1) < 0) return NULL; // check dest file & orginal file, compare there mtime #ifdef WIN32 { char *tmp_ptr; GetTempPath(sizeof(tmpfile) - 20, tmpfile); tmp_ptr = tmpfile + strlen(tmpfile); if (tmp_ptr[-1] == '\\') tmp_ptr--; sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf)); } #else sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf)); #endif if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime) { xdb_t x; if ((x = xdb_open(tmpfile, 'r')) != NULL) { xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; if (mode & SCWS_XDICT_MEM) { /* convert the xdb(disk) -> xtree(memory) */ if ((xt = xdb_to_xtree(x, NULL)) != NULL) { xdb_close(x); xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } } xd->xmode = SCWS_XDICT_XDB; xd->xdict = (void *) x; return xd; } } // create xtree if ((xt = xtree_new(0, 0)) == NULL) return NULL; else { int cl, kl; FILE *fp; word_st word, *w; char *key, *part, *last, *delim = " \t\r\n"; // re-build the xdb file from text file if ((fp = fopen(buf, "r")) == NULL) return NULL; // parse every line word.attr[2] = '\0'; while (fgets(buf, sizeof(buf) - 1, fp) != NULL) { // <word>[\t<tf>[\t<idf>[\t<attr>]]] if (buf[0] == ';' || buf[0] == '#') continue; key = _strtok_r(buf, delim, &last); if (key == NULL) continue; kl = strlen(key); // init the word do { word.tf = word.idf = 1.0; word.flag = SCWS_WORD_FULL; word.attr[0] = '@'; word.attr[1] = '\0'; if (!(part = _strtok_r(NULL, delim, &last))) break; word.tf = (float) atof(part); if (!(part = _strtok_r(NULL, delim, &last))) break; word.idf = (float) atof(part); if ((part = _strtok_r(NULL, delim, &last))) { word.attr[0] = part[0]; if (part[1]) word.attr[1] = part[1]; } } while (0); // save 
into xtree if ((w = xtree_nget(xt, key, kl, NULL)) == NULL) { w = (word_st *) pmalloc(xt->p, sizeof(word_st)); memcpy(w, &word, sizeof(word)); xtree_nput(xt, w, sizeof(word), key, kl); } else { w->tf = word.tf; w->idf = word.idf; w->flag |= word.flag; strcpy(w->attr, word.attr); } // parse the part cl = ml[(unsigned char) (key[0])]; while (1) { cl += ml[(unsigned char) (key[cl])]; if (cl >= kl) break; if ((w = xtree_nget(xt, key, cl, NULL)) != NULL) w->flag |= SCWS_WORD_PART; else { w = (word_st *) pmalloc_z(xt->p, sizeof(word_st)); w->flag = SCWS_WORD_PART; xtree_nput(xt, w, sizeof(word), key, cl); } } } fclose(fp); // optimize the xtree & save to xdb xtree_optimize(xt); unlink(tmpfile); xtree_to_xdb(xt, tmpfile); chmod(tmpfile, 0777); // return xtree xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } }
/*
 * Apply one JSON value to a rule set.
 *
 * Only JSON arrays are handled; any other value type is ignored.
 *
 *  - When rule->name is "attrs", every array element is parsed as an
 *    attribute-ratio line "a1(n1) + a2(n2) = ratio" and appended to the
 *    rules->attr chain (after any entries that are already there).
 *  - Otherwise every array element is trimmed and stored in rules->tree
 *    as an entry of `rule`.
 *
 * Elements without a string value are skipped.  The element strings are
 * modified in place while parsing (separators are overwritten with NULs).
 */
void scws_rule_json_set(rule_t rules, rule_item_t rule, cJSON *rulevalue)
{
	char *rulename, *valuestring, *ptr, *qtr;
	size_t valuelen, i;
	int count;
	cJSON *item;
	rule_attr_t value, rtail;

	if (rules == NULL || rule == NULL || rulevalue == NULL)
		return;
	rulename = rule->name;

	// only non-empty arrays carry anything to load
	if (rulevalue->type != cJSON_Array)
		return;
	if ((count = cJSON_GetArraySize(rulevalue)) < 1)
		return;
	valuelen = (size_t) count;

	// attrs
	if (!strcmp(rulename, "attrs"))
	{
		// find the current end of the chain so new entries are appended
		rtail = rules->attr;
		while (rtail != NULL && rtail->next != NULL)
			rtail = rtail->next;

		for (i = 0; i < valuelen; i++)
		{
			item = cJSON_GetArrayItem(rulevalue, i);
			if (item == NULL || (valuestring = item->valuestring) == NULL)
				continue;

			// split "a1(n1) + a2(n2) = ratio" at '+', then at the '=' after it
			while (isspace((unsigned char) *valuestring)) valuestring++;
			if ((ptr = strchr(valuestring, '+')) == NULL)
				continue;
			*ptr++ = '\0';
			if ((qtr = strchr(ptr, '=')) == NULL)
				continue;
			*qtr++ = '\0';

			value = (rule_attr_t) calloc(1, sizeof(struct scws_rule_attr));
			if (value == NULL)
				continue;

			// ratio (at least 1)
			while (isspace((unsigned char) *qtr)) qtr++;
			value->ratio = (short) atoi(qtr);
			if (value->ratio < 1)
				value->ratio = 1;
			// 0xff = no npath given
			value->npath[0] = value->npath[1] = 0xff;

			// attr1 & npath1 (stored zero-based)
			value->attr1[0] = *valuestring++;
			if (*valuestring && *valuestring != '(' && *valuestring != ' ' && *valuestring != '\t')
				value->attr1[1] = *valuestring++;
			while (*valuestring && *valuestring != '(') valuestring++;
			if (*valuestring == '(')
			{
				valuestring++;
				if ((qtr = strchr(valuestring, ')')) != NULL)
				{
					*qtr = '\0';
					value->npath[0] = (unsigned char) atoi(valuestring);
					if (value->npath[0] > 0)
						value->npath[0]--;
					else
						value->npath[0] = 0xff;
				}
			}

			// attr2 & npath2 (stored zero-based)
			valuestring = ptr;
			while (isspace((unsigned char) *valuestring)) valuestring++;
			value->attr2[0] = *valuestring++;
			if (*valuestring && *valuestring != '(' && *valuestring != ' ' && *valuestring != '\t')
				value->attr2[1] = *valuestring++;
			while (*valuestring && *valuestring != '(') valuestring++;
			if (*valuestring == '(')
			{
				valuestring++;
				if ((qtr = strchr(valuestring, ')')) != NULL)
				{
					*qtr = '\0';
					value->npath[1] = (unsigned char) atoi(valuestring);
					if (value->npath[1] > 0)
						value->npath[1]--;
					else
						value->npath[1] = 0xff;
				}
			}

			/* append to the chain list */
			if (rtail == NULL)
				rules->attr = value;
			else
				rtail->next = value;
			rtail = value;
		}
		return;
	}

	// any other rule: each array element is one entry of the rule
	for (i = 0; i < valuelen; i++)
	{
		item = cJSON_GetArrayItem(rulevalue, i);
		if (item == NULL || (valuestring = item->valuestring) == NULL)
			continue;

		// trim both ends
		while (isspace((unsigned char) *valuestring)) valuestring++;
		ptr = valuestring + strlen(valuestring);
		while (ptr > valuestring && strchr(" \t\r\n", ptr[-1])) ptr--;
		*ptr = '\0';

		if (ptr == valuestring)
			continue;

		xtree_nput(rules->tree, rule, sizeof(struct scws_rule_item), valuestring, ptr - valuestring);
	}
}