/*
 * Recursively collect the node stored at `ptr` and every node reachable
 * from it, in pre-order (node itself, then left subtree, then right subtree).
 *
 * Record layout as consumed here:
 *   - first  sizeof(xptr_st) bytes : pointer to the left child
 *   - second sizeof(xptr_st) bytes : pointer to the right child
 *   - byte 16                      : key length
 *   - byte 17 onward               : key bytes
 * (presumably 16 == 2 * sizeof(xptr_st); the code relies on the literal
 * offsets 16/17 — confirm against the xptr_st definition.)
 *
 * x      database handle, forwarded to _xdb_read_data().
 * ptr    in: location/length of the record to load; len == 0 means "no node".
 *        The struct is reused as scratch space for the child pointers, so its
 *        contents are overwritten by the time the function returns.
 * nodes  output array; entry nodes[*count] receives the record's off/len and
 *        a copy of its key made with _mem_ndup(). Capacity is NOT checked
 *        here — the caller must supply an array large enough for the tree.
 * count  in/out: number of entries filled so far; incremented per node.
 *
 * Hardening against malformed records: the buffer is zero-filled, records too
 * short to hold the 17-byte header are skipped, and the key length read from
 * the file is clamped to the number of key bytes that were actually read, so
 * the key copy can never run past the data in `buf`.
 */
static void _xdb_load_nodes(xdb_t x, xptr_t ptr, xcmper_st *nodes, int *count)
{
	enum { KLEN_POS = 16, KEY_POS = 17 };
	/* Zero-filled so a short read yields empty (len == 0) child pointers. */
	unsigned char buf[XDB_MAXKLEN + 18] = {0};
	int rdlen, klen, idx;

	if (ptr->len == 0)
		return;

	/* Read at most sizeof(buf)-1 bytes: header plus the longest legal key. */
	rdlen = (int) sizeof(buf) - 1;
	if (rdlen > (int) ptr->len)
		rdlen = (int) ptr->len;

	/*
	 * A record shorter than its header is malformed. This also catches a
	 * len that turned negative under the (int) conversion above, which
	 * would otherwise be passed on as a huge read length.
	 */
	if (rdlen < KEY_POS)
		return;

	_xdb_read_data(x, buf, ptr->off, rdlen);

	/* Never trust the stored key length beyond what was actually read. */
	klen = buf[KLEN_POS];
	if (klen > rdlen - KEY_POS)
		klen = rdlen - KEY_POS;

	idx = *count;
	nodes[idx].ptr.off = ptr->off;
	nodes[idx].ptr.len = ptr->len;
	nodes[idx].key = (char *) _mem_ndup(buf + KEY_POS, klen);
	*count = idx + 1;

	/* left & right: reuse *ptr as the cursor for each child in turn */
	memcpy(ptr, buf, sizeof(xptr_st));
	_xdb_load_nodes(x, ptr, nodes, count);
	memcpy(ptr, buf + sizeof(xptr_st), sizeof(xptr_st));
	_xdb_load_nodes(x, ptr, nodes, count);
}
/*
 * Segment the run of text in s->txt from offset s->off up to (but not
 * including) offset `end`, reporting each piece through SCWS_PUT_RES.
 *
 * Three stages, tried in order:
 *   1. If the run is longer than one byte, an upper-cased copy of it is
 *      tested with SCWS_IS_SPECIAL; on a match the whole run is reported as
 *      a single "nz" result (second argument 9.5) and the function returns.
 *   2. If the run is a dotted abbreviation (letter '.' letter '.' ..., final
 *      dot optional, e.g. S.H.E or M.R.), the whole run is reported as a
 *      single "nz" result (second argument 7.5) and the function returns.
 *   3. Otherwise the run is scanned byte by byte. An alphanumeric byte starts
 *      a token which is extended by the rules commented below and reported
 *      with attr_en; any other byte is reported alone with attr_un unless
 *      SCWS_IGN_SYMBOL is set in s->mode.
 *
 * s    segmenter state; this function reads s->txt, s->off and s->mode.
 * end  offset one past the last byte of the run.
 *
 * NOTE(review): several rules look one byte ahead via txt[start+1], which can
 * index txt[end]; presumably the buffer is always readable there — confirm.
 */
static void _scws_ssegment(scws_t s, int end)
{
	int start, wlen, ch, pflag, ipflag = 0;
	unsigned char *txt;
	float idf;

	start = s->off;
	wlen = end - start;

	/* check special words (need strtoupper) */
	if (wlen > 1) {
		/*
		 * Work on a temporary copy so the source text is not modified.
		 * NOTE(review): the cast is (char *) while txt is unsigned char *.
		 */
		txt = (char *) _mem_ndup(s->txt + start, wlen);
		_str_toupper(txt, txt);
		if (SCWS_IS_SPECIAL(txt, wlen)) {
			SCWS_PUT_RES(start, 9.5, wlen, "nz");
			free(txt);
			return;
		}
		free(txt);
	}

	/* From here on txt refers to the original text, not the copy. */
	txt = s->txt;

	/* check brief words such as S.H.E M.R. */
	if (SCWS_IS_ALPHA(txt[start]) && txt[start+1] == '.') {
		/*
		 * Here ch is used as an index, not a character. Each iteration
		 * consumes one letter and, unless the run ends right after it,
		 * one following dot. Leaving the loop with ch == end means the
		 * whole run matched the pattern.
		 */
		for (ch = start + 2; ch < end; ch++) {
			if (!SCWS_IS_ALPHA(txt[ch])) break;
			ch++;
			if (ch == end || txt[ch] != '.') break;
		}
		if (ch == end) {
			SCWS_PUT_RES(start, 7.5, wlen, "nz");
			return;
		}
	}

	/*
	 * Extract words and punctuation.
	 * A number may contain one dot, provided the byte after the dot is a
	 * digit (dots may not be consecutive).
	 * A word may contain one apostrophe (not consecutive).
	 */
	while (start < end) {
		ch = txt[start++];
		/* Leave "IP/version" mode at the first byte that is neither '.' nor a digit. */
		if (ipflag && ch != 0x2e && !SCWS_IS_DIGIT(ch))
			ipflag = 0;
		if (SCWS_IS_ALNUM(ch)) {
			/* pflag tracks the token kind (PFLAG_DIGIT) and whether a joining symbol was used (PFLAG_ADDSYM). */
			pflag = SCWS_IS_DIGIT(ch) ? PFLAG_DIGIT : 0;
			wlen = 1;
			while (start < end) {
				ch = txt[start];
				if (pflag & PFLAG_DIGIT) {
					if (!SCWS_IS_DIGIT(ch)) {
						// percent sign (% = 0x25) not followed by a digit: take it into the number and end the token
						if (ch == 0x25 && !SCWS_IS_DIGIT(txt[start+1])) {
							start++;
							wlen++;
							break;
						}
						// in IP/version mode every digit group is reported on its own
						if (ipflag)
							break;
						// special for IP address or version number? (find out all digit + dot)
						// A second dot in a number that already accepted one: step start/wlen
						// back to the earlier dot so that only the leading digit group is
						// reported, and set ipflag so the remaining groups are split at each dot.
						if (ch == 0x2e && (pflag & PFLAG_ADDSYM)) {
							ipflag = 1;
							while(--wlen && txt[--start] != 0x2e);
							pflag = 0;
							break;
						}
						// wlen = 1: a single digit followed by a letter — switch the token to
						// letter mode and re-examine the same byte (start is not advanced)
						if (wlen == 1 && SCWS_IS_ALPHA(ch)) {
							pflag ^= PFLAG_DIGIT;
							pflag |= PFLAG_ADDSYM;
							continue;
						}
						// Accept one '.' followed by a digit; anything else ends the token.
						// strict must add: !$this->_is_digit(ord($this->txt[$start+1])))
						if ((pflag & PFLAG_ADDSYM) || !(ch == 0x2e && SCWS_IS_DIGIT(txt[start+1])))
							break;
						pflag |= PFLAG_ADDSYM;
					}
				} else {
					/* hightman.110419: '-' (0x2d) between letters may join them; '_' (0x5f) may join as well */
					if ((ch == 0x2d || ch == 0x5f) && SCWS_IS_ALPHA(txt[start+1]))
						pflag |= PFLAG_ADDSYM;
					else if (!SCWS_IS_ALPHA(ch)) {
						/*
						 * A non-letter is accepted only if no symbol has been
						 * accepted yet and it is either an apostrophe (0x27)
						 * followed by a letter, or a digit that is not
						 * followed by another digit.
						 */
						if ((pflag & PFLAG_ADDSYM) || !((ch == 0x27 && SCWS_IS_ALPHA(txt[start+1])) || (SCWS_IS_DIGIT(ch) && !SCWS_IS_DIGIT(txt[start+1])))) {
							break;
						}
						pflag |= PFLAG_ADDSYM;
					}
				}
				/* The byte was accepted into the current token. */
				start++;
				wlen++;
				/* Cap the token length at SCWS_MAX_EWLEN bytes. */
				if (wlen >= SCWS_MAX_EWLEN)
					break;
			}
			/* Report the token; it occupies [start - wlen, start). */
			idf = SCWS_EN_IDF(wlen);
			SCWS_PUT_RES(start-wlen, idf, wlen, attr_en);
			/* Tokens that used a joining symbol get extra handling in _scws_alnum_multi when SCWS_MULTI_DUALITY is set. */
			if ((s->mode & SCWS_MULTI_DUALITY) && (pflag & PFLAG_ADDSYM))
				_scws_alnum_multi(s, start-wlen, wlen);
		} else if (!(s->mode & SCWS_IGN_SYMBOL)) {
			/* Non-alphanumeric byte: reported alone (start was already advanced past it). */
			SCWS_PUT_RES(start-1, 0.0, 1, attr_un);
		}
	}
}