rule_t scws_rule_new(const char *fpath, unsigned char *mblen) { FILE *fp; rule_t r; rule_item_t cr; int i, j, rbl, aflag; rule_attr_t a,rtail; unsigned char buf[512], *str, *ptr, *qtr; /* loaded or open file failed */ if ((fp = fopen(fpath, "r")) == NULL) return NULL; /* alloc the memory */ r = (rule_t) malloc(sizeof(rule_st)); memset(r, 0, sizeof(rule_st)); /* quick scan to add the name to list */ i = j = rbl = aflag = 0; while (fgets(buf, sizeof(buf)-1, fp)) { if (buf[0] != '[' || !(ptr = strchr(buf, ']'))) continue; str = buf + 1; *ptr = '\0'; if (ptr == str || (ptr-str) > 15 || !strcasecmp(str, "attrs")) continue; if (_rule_index_get(r, str) >= 0) continue; strcpy(r->items[i].name, str); r->items[i].tf = 5.0; r->items[i].idf = 3.5; strncpy(r->items[i].attr, "un", 2); if (!strcasecmp(str, "special")) r->items[i].bit = SCWS_RULE_SPECIAL; else if (!strcasecmp(str, "nostats")) r->items[i].bit = SCWS_RULE_NOSTATS; else { r->items[i].bit = (1<<j); j++; } if (++i >= SCWS_RULE_MAX) break; } rewind(fp); /* load the tree data */ if ((r->tree = xtree_new(0, 1)) == NULL) { free(r); return NULL; } cr = NULL; while (fgets(buf, sizeof(buf)-1, fp)) { if (buf[0] == ';') continue; if (buf[0] == '[') { cr = NULL; str = buf + 1; aflag = 0; if ((ptr = strchr(str, ']')) != NULL) { *ptr = '\0'; if (!strcasecmp(str, "attrs")) { aflag = 1; } else if ((i = _rule_index_get(r, str)) >= 0) { rbl = 1; /* default read by line = yes */ cr = &r->items[i]; } } continue; } /* attr flag open? */ if (aflag == 1) { /* parse the attr line */ str = buf; while (*str == ' ' || *str == '\t') str++; if ((ptr = strchr(str, '+')) == NULL) continue; *ptr++ = '\0'; if ((qtr = strchr(ptr, '=')) == NULL) continue; *qtr++ = '\0'; /* create new memory */ a = (rule_attr_t) malloc(sizeof(struct scws_rule_attr)); memset(a, 0, sizeof(struct scws_rule_attr)); /* get ratio */ while(*qtr == ' ' || *qtr == '\t') qtr++; a->ratio = (short) atoi(qtr); if (a->ratio < 1) a->ratio = 1; a->npath[0] = a->npath[1] = 0xff; /* read attr1 & npath1? */ a->attr1[0] = *str++; if (*str && *str != '(' && *str != ' ' && *str != '\t') a->attr1[1] = *str++; while (*str && *str != '(') str++; if (*str == '(') { str++; if ((qtr = strchr(str, ')')) != NULL) { *qtr = '\0'; a->npath[0] = (unsigned char) atoi(str); if (a->npath[0] > 0) a->npath[0]--; else a->npath[0] = 0xff; } } /* read attr1 & npath2? */ str = ptr; while (*str == ' ' || *str == '\t') str++; a->attr2[0] = *str++; if (*str && *str != '(' && *str != ' ' && *str != '\t') a->attr2[1] = *str++; while (*str && *str != '(') str++; if (*str == '(') { str++; if ((qtr = strchr(str, ')')) != NULL) { *qtr = '\0'; a->npath[1] = (unsigned char) atoi(str); if (a->npath[1] > 0) a->npath[1]--; else a->npath[1] = 0xff; } } //printf("%c%c(%d)+%c%c(%d)=%d\n", a->attr1[0], a->attr1[1] ? a->attr1[1] : ' ', a->npath[0], // a->attr2[0], a->attr2[1] ? a->attr2[1] : ' ', a->npath[1], a->ratio); /* append to the chain list */ if (r->attr == NULL) r->attr = rtail = a; else { rtail->next = a; rtail = a; } continue; } if (cr == NULL) continue; /* param set: line|znum|include|exclude|type|tf|idf|attr */ if (buf[0] == ':') { str = buf + 1; if (!(ptr = strchr(str, '='))) continue; while (*str == ' ' || *str == '\t') str++; qtr = ptr + 1; while (ptr > str && (ptr[-1] == ' ' || ptr[-1] == '\t')) ptr--; *ptr = '\0'; ptr = str; str = qtr; while (*str == ' ' || *str == '\t') str++; if (!strcmp(ptr, "line")) rbl = (*str == 'N' || *str == 'n') ? 0 : 1; else if (!strcmp(ptr, "tf")) cr->tf = (float) atof(str); else if (!strcmp(ptr, "idf")) cr->idf = (float) atof(str); else if (!strcmp(ptr, "attr")) strncpy(cr->attr, str, 2); else if (!strcmp(ptr, "znum")) { if ((ptr = strchr(str, ',')) != NULL) { *ptr++ = '\0'; while (*ptr == ' ' || *ptr == '\t') ptr++; cr->zmax = atoi(ptr); cr->flag |= SCWS_ZRULE_RANGE; } cr->zmin = atoi(str); } else if (!strcmp(ptr, "type")) { if (!strncmp(str, "prefix", 6)) cr->flag |= SCWS_ZRULE_PREFIX; else if (!strncmp(str, "suffix", 6)) cr->flag |= SCWS_ZRULE_SUFFIX; } else if (!strcmp(ptr, "include") || !strcmp(ptr, "exclude")) { unsigned int *clude; if (!strcmp(ptr, "include")) { clude = &cr->inc; cr->flag |= SCWS_ZRULE_INCLUDE; } else { clude = &cr->exc; cr->flag |= SCWS_ZRULE_EXCLUDE; } while ((ptr = strchr(str, ',')) != NULL) { while (ptr > str && (ptr[-1] == '\t' || ptr[-1] == ' ')) ptr--; *ptr = '\0'; if ((i = _rule_index_get(r, str)) >= 0) *clude |= r->items[i].bit; str = ptr + 1; while (*str == ' ' || *str == '\t' || *str == ',') str++; } ptr = strlen(str) + str; while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--; *ptr = '\0'; if (ptr > str && (i = _rule_index_get(r, str))) *clude |= r->items[i].bit; } continue; } /* read the entries */ str = buf; while (*str == ' ' || *str == '\t') str++; ptr = str + strlen(str); while (ptr > str && strchr(" \t\r\n", ptr[-1])) ptr--; *ptr = '\0'; /* emptry line */ if (ptr == str) continue; if (rbl) xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, ptr - str); else { while (str < ptr) { j = mblen[(*str)]; #ifdef DEBUG /* try to check repeat */ if ((i = (int) xtree_nget(r->tree, str, j, NULL)) != 0) fprintf(stderr, "Reapeat word on %s|%s: %.*s\n", cr->name, ((rule_item_t) i)->name, j, str); #endif xtree_nput(r->tree, cr, sizeof(struct scws_rule_item), str, j); str += j; } } } fclose(fp); /* optimize the tree */ xtree_optimize(r->tree); return r; }
/* open the text dict */ static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml) { xdict_t xd; xtree_t xt; char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX]; struct stat st1, st2; // check the input filepath _realpath(fpath, buf); if (stat(buf, &st1) < 0) return NULL; // check dest file & orginal file, compare there mtime #ifdef WIN32 { char *tmp_ptr; GetTempPath(sizeof(tmpfile) - 20, tmpfile); tmp_ptr = tmpfile + strlen(tmpfile); if (tmp_ptr[-1] == '\\') tmp_ptr--; sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf)); } #else sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf)); #endif if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime) { xdb_t x; if ((x = xdb_open(tmpfile, 'r')) != NULL) { xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; if (mode & SCWS_XDICT_MEM) { /* convert the xdb(disk) -> xtree(memory) */ if ((xt = xdb_to_xtree(x, NULL)) != NULL) { xdb_close(x); xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } } xd->xmode = SCWS_XDICT_XDB; xd->xdict = (void *) x; return xd; } } // create xtree if ((xt = xtree_new(0, 0)) == NULL) return NULL; else { int cl, kl; FILE *fp; word_st word, *w; char *key, *part, *last, *delim = " \t\r\n"; // re-build the xdb file from text file if ((fp = fopen(buf, "r")) == NULL) return NULL; // parse every line word.attr[2] = '\0'; while (fgets(buf, sizeof(buf) - 1, fp) != NULL) { // <word>[\t<tf>[\t<idf>[\t<attr>]]] if (buf[0] == ';' || buf[0] == '#') continue; key = _strtok_r(buf, delim, &last); if (key == NULL) continue; kl = strlen(key); // init the word do { word.tf = word.idf = 1.0; word.flag = SCWS_WORD_FULL; word.attr[0] = '@'; word.attr[1] = '\0'; if (!(part = _strtok_r(NULL, delim, &last))) break; word.tf = (float) atof(part); if (!(part = _strtok_r(NULL, delim, &last))) break; word.idf = (float) atof(part); if ((part = _strtok_r(NULL, delim, &last))) { word.attr[0] = part[0]; if (part[1]) word.attr[1] = part[1]; } } while (0); // save into xtree if ((w = xtree_nget(xt, key, kl, NULL)) == NULL) { w = (word_st *) pmalloc(xt->p, sizeof(word_st)); memcpy(w, &word, sizeof(word)); xtree_nput(xt, w, sizeof(word), key, kl); } else { w->tf = word.tf; w->idf = word.idf; w->flag |= word.flag; strcpy(w->attr, word.attr); } // parse the part cl = ml[(unsigned char) (key[0])]; while (1) { cl += ml[(unsigned char) (key[cl])]; if (cl >= kl) break; if ((w = xtree_nget(xt, key, cl, NULL)) != NULL) w->flag |= SCWS_WORD_PART; else { w = (word_st *) pmalloc_z(xt->p, sizeof(word_st)); w->flag = SCWS_WORD_PART; xtree_nput(xt, w, sizeof(word), key, cl); } } } fclose(fp); // optimize the xtree & save to xdb xtree_optimize(xt); unlink(tmpfile); xtree_to_xdb(xt, tmpfile); chmod(tmpfile, 0777); // return xtree xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } }
rule_t scws_rule_json_new(const char *r, int m) { cJSON *json_rules; rule_t rules; rule_item_t rule; char *content; if (m == SCWS_RULE_JSON_STRING) { content = (char *)r; json_rules = cJSON_Parse(content); } else if (m == SCWS_RULE_JSON_FILE) { FILE *fp; if ((fp = fopen(r, "r")) == NULL) return NULL; fseek(fp, 0, SEEK_END); long len = ftell(fp); fseek(fp, 0, SEEK_SET); content = (char*)malloc(len + 1); fread(content, 1, len, fp); fclose(fp); json_rules = cJSON_Parse(content); free(content); } if (!json_rules || json_rules->type != cJSON_Object) { printf("JSON syntax error: %s\n", cJSON_GetErrorPtr()); return NULL; } // alloc rules rules = (rule_t)malloc(sizeof(rule_st)); memset(rules, 0, sizeof(rule_st)); rules->ref = 1; // gc counter if ((rules->tree = xtree_new(0, 1)) == NULL) { free(rules); return NULL; } size_t i = 0; const char *rulename; cJSON *json_rule_ents, *json_rule_ent, *json_rule_values; if ((json_rule_ents = json_rules->child) == NULL) return NULL; json_rule_ent = json_rule_ents; while ((json_rule_ent) != NULL) { rulename = json_rule_ent->string; // printf("\nSetting JSON rule entry: %s\n", rulename); strcpy(rules->items[i].name, json_rule_ent->string); rules->items[i].tf = 5.0; rules->items[i].idf = 3.5; strncpy(rules->items[i].attr, "un", 2); // set rule.bit if (!strcmp(rulename, "special")) rules->items[i].bit = SCWS_RULE_SPECIAL; else if (!strcmp(rulename, "nostats")) rules->items[i].bit = SCWS_RULE_NOSTATS; else rules->items[i].bit = (1 << i); rule = &(rules->items[i]); cJSON *json_rule_attrs; if ((json_rule_attrs = cJSON_GetObjectItem(json_rule_ent, "attrs")) != NULL && json_rule_attrs->type == cJSON_Object) scws_rule_json_set_attrs(rules, rule, json_rule_attrs); if ((json_rule_values = cJSON_GetObjectItem(json_rule_ent, "value")) != NULL) scws_rule_json_set(rules, rule, json_rule_values); i++; json_rule_ent = json_rule_ent->next; } xtree_optimize(rules->tree); cJSON_Delete(json_rules); // free cJSON return rules; }