/* public functions */

/*
 * Create an empty xtree that owns a freshly created memory pool.
 *
 * base  - stored in the tree's `base` field; 0 selects the default 0xf422f.
 * prime - number of slots in the `trees` array; 0 selects the default 31.
 *
 * Both the tree header and its zero-filled slot array are allocated from
 * the new pool, which is recorded in the tree's `p` field.
 *
 * Returns the new tree.
 */
xtree_t xtree_new(int base, int prime)
{
	pool_t pool;
	xtree_t tree;

	/* substitute the defaults for "unspecified" (zero) arguments */
	if (base == 0)
		base = 0xf422f;
	if (prime == 0)
		prime = 31;

	pool = pool_new();
	tree = pmalloc(pool, sizeof(xtree_st));

	tree->p = pool;
	tree->base = base;
	tree->prime = prime;
	tree->count = 0;

	/* one zero-initialised slot per hash bucket */
	tree->trees = (node_t *) pmalloc_z(pool, sizeof(node_t) * tree->prime);

	return tree;
}
/*
 * Segment the text range [s->off, end) of s->txt.
 *
 * s    - segmenter handle; s->txt, s->off, s->d (dictionary), s->r (rule set,
 *        may be NULL) and s->mode are read; s->wmap, s->zmap and s->zis are
 *        written.
 * end  - byte offset one past the last byte to process.
 * zlen - upper bound on the number of units in the range; used to size the
 *        work tables and then replaced by the real unit count.
 *
 * Steps:
 *   1. Split the range into units: a run of single-byte characters forms one
 *      unit, every multi-byte character forms its own unit.  wmap[i][i]
 *      describes unit i, zmap[i] holds its byte range.
 *   2. For every unit i, look up the dictionary for words spanning units
 *      i..j and store the hits in wmap[i][j].
 *   3. When a rule set is present, apply the one-unit and two-unit
 *      prefix/suffix rules, which may add further wmap[i][j] entries.
 *   4. Cut the unit sequence into zones and hand them to _scws_mseg_zone()
 *      / _scws_mset_word().
 *
 * All word records created here live in a private pool that is released
 * before returning, together with the wmap array.
 */
static void _scws_msegment(scws_t s, int end, int zlen)
{
	word_t **wmap, query;
	struct scws_zchar *zmap;
	unsigned char *txt;
	rule_item_t r1;
	int i, j, k, ch, clen, start;
	pool_t p;

	/* pool used to manage the dynamic memory of this call */
	p = pool_new();

	/* create wmap & zmap */
	wmap = s->wmap = (word_t **) darray_new(zlen, zlen, sizeof(word_t));
	zmap = s->zmap = (struct scws_zchar *) pmalloc(p, zlen * sizeof(struct scws_zchar));
	txt = s->txt;
	start = s->off;
	s->zis = -1;

	for (i = 0; start < end; i++)
	{
		ch = txt[start];
		clen = SCWS_CHARLEN(ch);
		if (clen == 1)
		{
			/*
			 * Collect the whole run of single-byte characters as one unit.
			 * txt[start] is only examined after the start == end check, so
			 * nothing at or beyond `end` is read.
			 */
			while (start++ < end)
			{
				if (start == end || SCWS_CHARLEN(txt[start]) > 1)
					break;
				clen++;
			}
			/* zero-filled record: idf stays 0, flag starts from 0 */
			wmap[i][i] = (word_t) pmalloc_z(p, sizeof(word_st));
			wmap[i][i]->tf = 0.5;
			wmap[i][i]->flag |= SCWS_ZFLAG_ENGLISH;
			/* the attribute is chosen from the last byte of the run */
			strcpy(wmap[i][i]->attr, SCWS_IS_ALPHA(txt[start-1]) ? attr_en : attr_un);
		}
		else
		{
			query = xdict_query(s->d, txt + start, clen);
			wmap[i][i] = (word_t) pmalloc(p, sizeof(word_st));
			if (query == NULL)
			{
				/* character unknown to the dictionary: fill in defaults */
				wmap[i][i]->tf = 0.5;
				wmap[i][i]->idf = 0.0;
				wmap[i][i]->flag = 0;
				strcpy(wmap[i][i]->attr, attr_un);
			}
			else
			{
				/* remember the original flags, then copy the record with
				 * its flag forced to SCWS_WORD_FULL */
				ch = query->flag;
				query->flag = SCWS_WORD_FULL;
				memcpy(wmap[i][i], query, sizeof(word_st));
				if (query->attr[0] == '#')
					wmap[i][i]->flag |= SCWS_ZFLAG_SYMBOL;

				/* results flagged MALLOCED are owned by the caller */
				if (ch & SCWS_WORD_MALLOCED)
					free(query);
			}
			start += clen;
		}

		zmap[i].start = start - clen;
		zmap[i].end = start;
	}

	/* fix up the real unit count */
	zlen = i;

	/* create word query table */
	for (i = 0; i < zlen; i++)
	{
		/* k stays 0 unless at least one word starting at unit i is stored */
		k = 0;
		for (j = i+1; j < zlen; j++)
		{
			query = xdict_query(s->d, txt + zmap[i].start, zmap[j].end - zmap[i].start);
			if (query == NULL)
				break;

			ch = query->flag;
			/* keep complete words whose attribute differs from attr_na */
			if ((ch & SCWS_WORD_FULL) && memcmp(query->attr, attr_na, 2))
			{
				wmap[i][j] = (word_t) pmalloc(p, sizeof(word_st));
				memcpy(wmap[i][j], query, sizeof(word_st));

				wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
				for (k = i+1; k <= j; k++)
					wmap[k][k]->flag |= SCWS_ZFLAG_WPART;
			}

			if (ch & SCWS_WORD_MALLOCED)
				free(query);

			/* without the PART flag no longer entry shares this prefix */
			if (!(ch & SCWS_WORD_PART))
				break;
		}

		/* here k is (end unit of the last stored word) + 1, or 0 if none */
		if (k--)
		{
			/* set nr2 to some short name */
			if ((k == (i+1)))
			{
				if (!memcmp(wmap[i][k]->attr, attr_nr, 2))
					wmap[i][i]->flag |= SCWS_ZFLAG_NR2;
				/* disabled in the original source:
				 * if (wmap[i][k]->attr[0] == 'n')
				 *     wmap[i][i]->flag |= SCWS_ZFLAG_N2;
				 */
			}

			/* clean the PART flag for the last word */
			if (k < j)
				wmap[i][k]->flag ^= SCWS_WORD_PART;
		}
	}

	if (s->r == NULL)
		goto do_segment;

	/*
	 * Auto rule set for name & zone & chinese numeric.
	 *
	 * The ___ZRULE_CHECKER*___ macros are defined outside this function and
	 * expand to statements operating on the locals (j, r1, wmap, ...).
	 * NOTE(review): they presumably leave the enclosing loop when unit j
	 * does not satisfy the rule -- confirm against their definition.
	 */

	/* one word auto rule check */
	for (i = 0; i < zlen; i++)
	{
		if (SCWS_NO_RULE1(wmap[i][i]->flag))
			continue;

		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[i].end - zmap[i].start);
		if (r1 == NULL)
			continue;

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (i < (zlen - clen)))
		{
			/* prefix, check after (zmin~zmax) */
			/* first make sure every unit within zmin qualifies,
			 * then collect qualifying units up to zmax */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			/* the loop above was left early: rule not satisfied */
			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			/* mind the case where a former 2-unit name is still 2 units
			 * long after recognition */
			if (wmap[i][i]->flag & SCWS_ZFLAG_NR2)
			{
				if (clen == 1)
					continue;
				wmap[i][i+1]->flag |= SCWS_WORD_PART;
			}

			/* ok, got: i & clen */
			k = i + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = (SCWS_WORD_RULE|SCWS_WORD_FULL);
			strncpy(wmap[i][k]->attr, r1->attr, 2);
			/* pmalloc() does not zero and strncpy() does not terminate */
			wmap[i][k]->attr[2] = '\0';

			wmap[i][i]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = i+1; j <= k; j++)
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			/* skip the consumed units unless unit i is itself part of
			 * an earlier word */
			if (!(wmap[i][i]->flag & SCWS_ZFLAG_WPART))
				i = k;

			continue;
		}

		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			if (wmap[k][i] != NULL)
				continue;

			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);
			/* pmalloc() does not zero and strncpy() does not terminate */
			wmap[k][i]->attr[2] = '\0';

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				/* shorter words starting at k become parts of the new one */
				if ((j != i) && (wmap[k][j] != NULL))
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}
			continue;
		}
	}

	/* two words auto rule check (e.g. a two-unit surname followed by a
	 * given name, or a name followed by a two-unit road suffix) */
	for (i = zlen - 2; i >= 0; i--)
	{
		/* an existing entry always has SCWS_WORD_FULL, no need to re-check */
		if ((wmap[i][i+1] == NULL) || (wmap[i][i+1]->flag & SCWS_WORD_PART))
			continue;

		k = i+1;
		r1 = scws_rule_get(s->r, txt + zmap[i].start, zmap[k].end - zmap[i].start);
		if (r1 == NULL)
			continue;

		clen = r1->zmin > 0 ? r1->zmin : 1;
		if ((r1->flag & SCWS_ZRULE_PREFIX) && (k < (zlen - clen)))
		{
			for (ch = 1; ch <= clen; ch++)
			{
				j = k + ch;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = k + ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER1___
				___ZRULE_CHECKER3___
				clen++;
				j++;
			}

			/* ok, got: i & clen */
			k = k + clen;
			wmap[i][k] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[i][k]->tf = r1->tf;
			wmap[i][k]->idf = r1->idf;
			wmap[i][k]->flag = SCWS_WORD_FULL;
			strncpy(wmap[i][k]->attr, r1->attr, 2);
			/* pmalloc() does not zero and strncpy() does not terminate */
			wmap[i][k]->attr[2] = '\0';

			/* the two-unit word is now a part of the longer one */
			wmap[i][i+1]->flag |= SCWS_WORD_PART;
			for (j = i+2; j <= k; j++)
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;

			i--;
			continue;
		}

		if ((r1->flag & SCWS_ZRULE_SUFFIX) && (i >= clen))
		{
			/* suffix, check before */
			for (ch = 1; ch <= clen; ch++)
			{
				j = i - ch;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
			}

			if (ch <= clen)
				continue;

			/* no limit znum or limit to a range */
			j = i - ch;
			while (1)
			{
				if ((!r1->zmax && r1->zmin) || (r1->zmax && (clen >= r1->zmax)))
					break;
				___ZRULE_CHECKER2___
				___ZRULE_CHECKER3___
				clen++;
				j--;
			}

			/* ok, got: i & clen (maybe clen=1 & [k][i] isset) */
			k = i - clen;
			/* i now addresses the second unit of the two-unit suffix */
			i = i + 1;
			wmap[k][i] = (word_t) pmalloc(p, sizeof(word_st));
			wmap[k][i]->tf = r1->tf;
			wmap[k][i]->idf = r1->idf;
			wmap[k][i]->flag = SCWS_WORD_FULL;
			strncpy(wmap[k][i]->attr, r1->attr, 2);
			/* pmalloc() does not zero and strncpy() does not terminate */
			wmap[k][i]->attr[2] = '\0';

			wmap[k][k]->flag |= SCWS_ZFLAG_WHEAD;
			for (j = k+1; j <= i; j++)
			{
				wmap[j][j]->flag |= SCWS_ZFLAG_WPART;
				/* NOTE(review): unlike the one-unit suffix case there is no
				 * (j != i) guard, so the new word itself also gets PART --
				 * confirm this is intended. */
				if (wmap[k][j] != NULL)
					wmap[k][j]->flag |= SCWS_WORD_PART;
			}

			/* continue scanning in front of the word just created */
			i -= (clen+1);
			continue;
		}
	}

	/* really do the segmentation */
do_segment:

	/* find the easy break points: a unit that is not inside any word
	 * closes the pending zone [j, i-1] */
	for (i = 0, j = 0; i < zlen; i++)
	{
		if (wmap[i][i]->flag & SCWS_ZFLAG_WPART)
			continue;

		if (i > j)
			_scws_mseg_zone(s, j, i-1);

		j = i;
		/* a unit that does not start a word either is emitted on its own */
		if (!(wmap[i][i]->flag & SCWS_ZFLAG_WHEAD))
		{
			_scws_mset_word(s, i, i);
			j++;
		}
	}

	/* the last zone */
	if (i > j)
		_scws_mseg_zone(s, j, i-1);

	/* the last single for duality */
	if ((s->mode & SCWS_DUALITY) && (s->zis >= 0) && !(s->zis & SCWS_ZIS_USED))
	{
		i = s->zis;
		SCWS_PUT_RES(s->zmap[i].start, s->wmap[i][i]->idf, (s->zmap[i].end - s->zmap[i].start), s->wmap[i][i]->attr);
	}

	/* free the wmap & zmap (zmap and all word records live in the pool) */
	pool_free(p);
	darray_free((void **) wmap);
}
/* open the text dict */ static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml) { xdict_t xd; xtree_t xt; char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX]; struct stat st1, st2; // check the input filepath _realpath(fpath, buf); if (stat(buf, &st1) < 0) return NULL; // check dest file & orginal file, compare there mtime #ifdef WIN32 { char *tmp_ptr; GetTempPath(sizeof(tmpfile) - 20, tmpfile); tmp_ptr = tmpfile + strlen(tmpfile); if (tmp_ptr[-1] == '\\') tmp_ptr--; sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf)); } #else sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf)); #endif if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime) { xdb_t x; if ((x = xdb_open(tmpfile, 'r')) != NULL) { xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; if (mode & SCWS_XDICT_MEM) { /* convert the xdb(disk) -> xtree(memory) */ if ((xt = xdb_to_xtree(x, NULL)) != NULL) { xdb_close(x); xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } } xd->xmode = SCWS_XDICT_XDB; xd->xdict = (void *) x; return xd; } } // create xtree if ((xt = xtree_new(0, 0)) == NULL) return NULL; else { int cl, kl; FILE *fp; word_st word, *w; char *key, *part, *last, *delim = " \t\r\n"; // re-build the xdb file from text file if ((fp = fopen(buf, "r")) == NULL) return NULL; // parse every line word.attr[2] = '\0'; while (fgets(buf, sizeof(buf) - 1, fp) != NULL) { // <word>[\t<tf>[\t<idf>[\t<attr>]]] if (buf[0] == ';' || buf[0] == '#') continue; key = _strtok_r(buf, delim, &last); if (key == NULL) continue; kl = strlen(key); // init the word do { word.tf = word.idf = 1.0; word.flag = SCWS_WORD_FULL; word.attr[0] = '@'; word.attr[1] = '\0'; if (!(part = _strtok_r(NULL, delim, &last))) break; word.tf = (float) atof(part); if (!(part = _strtok_r(NULL, delim, &last))) break; word.idf = (float) atof(part); if ((part = _strtok_r(NULL, delim, &last))) { word.attr[0] = part[0]; if (part[1]) word.attr[1] = part[1]; } } while (0); // save 
into xtree if ((w = xtree_nget(xt, key, kl, NULL)) == NULL) { w = (word_st *) pmalloc(xt->p, sizeof(word_st)); memcpy(w, &word, sizeof(word)); xtree_nput(xt, w, sizeof(word), key, kl); } else { w->tf = word.tf; w->idf = word.idf; w->flag |= word.flag; strcpy(w->attr, word.attr); } // parse the part cl = ml[(unsigned char) (key[0])]; while (1) { cl += ml[(unsigned char) (key[cl])]; if (cl >= kl) break; if ((w = xtree_nget(xt, key, cl, NULL)) != NULL) w->flag |= SCWS_WORD_PART; else { w = (word_st *) pmalloc_z(xt->p, sizeof(word_st)); w->flag = SCWS_WORD_PART; xtree_nput(xt, w, sizeof(word), key, cl); } } } fclose(fp); // optimize the xtree & save to xdb xtree_optimize(xt); unlink(tmpfile); xtree_to_xdb(xt, tmpfile); chmod(tmpfile, 0777); // return xtree xd = (xdict_t) malloc(sizeof(xdict_st)); memset(xd, 0, sizeof(xdict_st)); xd->ref = 1; xd->xdict = (void *) xt; xd->xmode = SCWS_XDICT_MEM; return xd; } }