bool Dict::query(char* url) { unsigned sign1, sign2; Sdict_snode snode; creat_sign_f64(url, strlen(url), &sign1, &sign2); snode.sign1 = sign1; snode.sign2 = sign2; snode.code = -1; db_op1((Sdict_build*)_wordDict, &snode, SEEK); if (-1 != snode.code) { return true; } return false; }
int Dict::add(char* url) { unsigned sign1, sign2; Sdict_snode snode; creat_sign_f64(url, strlen(url), &sign1, &sign2); snode.sign1 = sign1; snode.sign2 = sign2; snode.code = -1; snode.other = 0; if (-1 == db_op1((Sdict_build*)_wordDict, &snode, ADD)) { return -1; } return 0; }
int main(int argc,char** argv) { char buf[1024]; int len; unsigned int sign[2]; unsigned int mode; int output; unsigned long long v1,v2; if(argc!= 3){ fprintf(stdout,"Usage: %s mode output\n",argv[0]); return -1; } output=atoi(argv[2]); mode=atoi(argv[1]); while(fgets(buf,sizeof(buf),stdin)!=NULL){ len=strlen(buf); while((buf[len-1]=='\n')||(buf[len-1]=='\r')){ buf[len-1]=0; len--; } switch(mode){ case 1: creat_sign_f64(buf,len,sign,sign+1); break; case 2: creat_sign_fs64(buf,len,sign,sign+1); break; case 3: creat_sign_7_host(buf,len,sign,sign+1); break; case 4: creat_sign_md64(buf,len,sign,sign+1); break; case 5: creat_sign_mds64(buf,len,sign,sign+1); break; case 6: creat_sign_murmur64(buf,len,sign,sign+1); break; case 7: creat_sign_murmurs64(buf,len,sign,sign+1); break; case 8:{ unsigned int site_sign[2]; int ret1=0,ret2=0; ret1=create_url_sign(buf,site_sign,sign); ret2=create_url_sign(buf,v1,v2); if(output){ fprintf(stdout,"ret1:%d ret2:%d\n",ret1,ret2); fprintf(stdout,"v1:%llu,v2:%llu\n",v1,v2); fprintf(stdout,"site:[0x%x:0x%x]\n",site_sign[0],site_sign[1]); } } break; case 9:{ unsigned int site_sign[2]; int ret1=0,ret2=0; ret1=create_url_sign2(buf,site_sign,sign); ret2=create_url_sign2(buf,v1,v2); if(output){ fprintf(stdout,"ret1:%d ret2:%d\n",ret1,ret2); fprintf(stdout,"v1:%llu,v2:%llu\n",v1,v2); fprintf(stdout,"site:[0x%x:0x%x]\n",site_sign[0],site_sign[1]); } } break; default: sign[0]=0; sign[1]=0; break; } if(output){ //fprintf(stdout,"%s [%u:%u]\n", buf,sign[0],sign[1]); fprintf(stdout,"[0x%x:0x%x]\n",sign[0],sign[1]); } } }
// Dynamically split phrases in the mixed-granularity (wpb) segmentation result
// to obtain a finer, more reasonable semantic granularity — between the mixed
// result and the basic (wsb) terms.
//
// phrasedict : dictionary of phrase signatures that are allowed to be split
// pout       : segmentation output holding the mixed (wpb), basic (wsb) and
//              subphrase (spb) term buffers
// subtokens  : caller-provided output array receiving the split tokens
// tcnt       : capacity of subtokens
// Returns the number of tokens written. On any error (null input, oversized
// term, dict failure, output full) it logs and returns the tokens produced
// so far — a partial result, not a distinct error code.
int seg_split( Sdict_search* phrasedict, scw_out_t* pout, token_t subtokens[], int tcnt )
{
    int ret = 0;
    Sdict_snode snode;
    int ds_ret = 0;
    if (pout == NULL || phrasedict == NULL)
    {
        WARNING_LOG("error: pout or phrasedict is null!");
        return ret;
    }
    // Walk every term of the mixed-granularity (wpb) result.
    for (u_int i = 0; i < pout->wpbtermcount; i++)
    {
        u_int pos = GET_TERM_POS( pout->wpbtermpos[i] );
        u_int len = GET_TERM_LEN( pout->wpbtermpos[i] );
        u_int off = pout->wpbtermoffsets[i];  // index of this term's first basic (wsb) term
        char term[256] = "\0";
        if(len >= 256)
        {
            WARNING_LOG("error: term length longer than 256!");
            return ret;
        }
        strncpy( term, pout->wpcompbuf+pos, len );
        term[len] = '\0';
        //book name combine
        // char *p = pout->wpcompbuf+pos;
        // if (strncmp( p, "《", 2 ) == 0 && strncmp( p+len-2, "》", 2 ) == 0)
        if(IS_BOOKNAME(pout->wpbtermprop[i]))
        {
            u_int bsccnt = 0;
            u_int boff = off;
            u_int rlen = len;
            // Count how many basic (wsb) terms this bookname spans.
            while (rlen > 0)
            {
                bsccnt += 1;  // number of basic terms
                rlen -= GET_TERM_LEN( pout->wsbtermpos[boff] );
                boff += 1;
            }
            if(IS_BOOKNAME_START(pout->wsbtermprop[off]) && IS_BOOKNAME_END(pout->wsbtermprop[off + bsccnt - 1]))
            {
                // Exactly 3 basics => opening bracket + title + closing bracket:
                // emit the three basic terms directly.
                if (bsccnt == 3)
                {
                    if (ret + 3 > tcnt)
                    {
                        WARNING_LOG("error : term result number more than max term count!");
                        return ret;
                    }
                    boff = off;
                    while (boff < off+3)
                    {
                        u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
                        u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );
                        strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
                        subtokens[ret].buffer[blen] = '\0';
                        subtokens[ret].offset = boff;
                        subtokens[ret].length = blen;
                        subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
                        subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
                        subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                        subtokens[ret].index = 0;
                        boff += 1;
                        ret += 1;
                    }
                    continue;
                }
                if(ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                // Emit the opening-bracket basic term.
                boff = off;
                u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
                u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );
                strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
                subtokens[ret].buffer[blen] = '\0';
                subtokens[ret].offset = boff;
                subtokens[ret].length = blen;
                subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
                subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
                subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                subtokens[ret].index = 0;
                ret += 1;
                //build complete seg from subphrase buffer
                // NOTE(review): len - 4 assumes the two brackets are 2 bytes each
                // (GBK-style encoding, per the commented-out strncmp above) — confirm.
                rlen = len - 4;
                boff = off + 1;
                while (rlen > 0)
                {
                    u_int lentemp = len;
                    int maxsubphridx = -1;
                    unsigned int maxsubphrlen = 0;
                    // Find the longest subphrase aligned at boff whose sign is
                    // NOT itself in phrasedict (i.e. not further splittable);
                    // each hit in the dict lowers lentemp and retries.
                    while(1)
                    {
                        for (int j = 0; j < (int)pout->spbtermcount; j++)
                        {
                            //u_int spos = GET_TERM_POS( pout->spbtermpos[j] );
                            u_int slen = GET_TERM_LEN( pout->spbtermpos[j] );
                            // skip subphrases not aligned at the current basic offset
                            if (pout->spbtermoffsets[j] != boff)
                            {
                                continue;
                            }
                            // skip the phrase itself, or anything not shorter than the current cap
                            if (slen >= len || slen >= lentemp)
                            {
                                continue;
                            }
                            if (maxsubphrlen < slen)
                            {
                                maxsubphridx = j;
                                maxsubphrlen = slen;
                            }
                        }
                        if (maxsubphridx >= 0)
                        {
                            u_int spos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
                            u_int slen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
                            char sterm[256] = "\0";
                            if(slen >= 256)
                            {
                                WARNING_LOG("error: term length longer than 256!");
                                return ret;
                            }
                            strncpy( sterm, pout->subphrbuf+spos, slen );
                            sterm[slen] = '\0';
                            // Reject candidates whose own sign is splittable.
                            creat_sign_f64 (sterm, slen, &(snode.sign1), &(snode.sign2));
                            ds_ret = ds_op1 (phrasedict, &snode, SEEK);
                            if ( ds_ret == 1)
                            {
                                lentemp = slen;  // tighten the cap and search again
                                // j = -1;
                                maxsubphridx = -1;
                                maxsubphrlen = 0;
                                continue;
                            }
                            else if(ds_ret != 0)
                            {
                                WARNING_LOG("error: ds_op1 result error!");
                                return ret;
                            }
                            else
                            {
                                break;  // candidate accepted
                            }
                        }
                        break;  // no candidate at all
                    }
                    // Copy the accepted max subphrase ...
                    if (maxsubphridx >= 0)
                    {
                        if(ret + 1 > tcnt)
                        {
                            WARNING_LOG("error : term result number more than max term count!");
                            return ret;
                        }
                        u_int mpos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
                        u_int mlen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
                        u_int moff = pout->spbtermoffsets[maxsubphridx];
                        strncpy( subtokens[ret].buffer, pout->subphrbuf+mpos, mlen );
                        subtokens[ret].buffer[mlen] = '\0';
                        subtokens[ret].offset = moff;
                        subtokens[ret].length = mlen;
                        subtokens[ret].prop.m_hprop = pout->spbtermprop[maxsubphridx].m_hprop;
                        subtokens[ret].prop.m_lprop = pout->spbtermprop[maxsubphridx].m_lprop;
                        subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                        subtokens[ret].index = 0;
                        ret += 1;
                        rlen -= mlen;
                        // Advance boff past all basic terms the subphrase covers.
                        int bcnt = 1;
                        mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
                        while (mlen > 0)
                        {
                            bcnt += 1;
                            moff += 1;
                            mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
                        }
                        boff += bcnt;
                    }
                    // ... or fall back to copying a single basic term.
                    else
                    {
                        if(ret + 1 > tcnt)
                        {
                            WARNING_LOG("error : term result number more than max term count!");
                            return ret;
                        }
                        u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
                        u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );
                        strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
                        subtokens[ret].buffer[blen] = '\0';
                        subtokens[ret].offset = boff;
                        subtokens[ret].length = blen;
                        subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
                        subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
                        subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                        subtokens[ret].index = 0;
                        ret += 1;
                        rlen -= blen;
                        boff += 1;
                    }
                }
                if(ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                // Emit the closing-bracket basic term.
                bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
                blen = GET_TERM_LEN( pout->wsbtermpos[boff] );
                strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
                subtokens[ret].buffer[blen] = '\0';
                subtokens[ret].offset = boff;
                subtokens[ret].length = blen;
                subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
                subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
                subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                subtokens[ret].index = 0;
                ret += 1;
                continue;
            }
        }
        // Non-bookname phrase: if its sign is NOT in the splittable-phrase
        // dict, keep the whole term unchanged.
        creat_sign_f64 (term, len, &(snode.sign1), &(snode.sign2));
        if (ds_op1 (phrasedict, &snode, SEEK) != 1)
        {
            if(ret + 1 > tcnt)
            {
                WARNING_LOG("error : term result number more than max term count!");
                return ret;
            }
            strncpy( subtokens[ret].buffer, term, len );
            subtokens[ret].buffer[len] = '\0';
            subtokens[ret].offset = off;
            subtokens[ret].length = len;
            subtokens[ret].prop.m_hprop = pout->wpbtermprop[i].m_hprop;
            subtokens[ret].prop.m_lprop = pout->wpbtermprop[i].m_lprop;
            subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
            subtokens[ret].index = 0;
            ret += 1;
            continue;
        }
        // Phrase is splittable: greedily cover it with the longest acceptable
        // subphrases, falling back to basic terms (same search as above).
        u_int rlen = len;
        u_int boff = off;
        while (rlen > 0)
        {
            u_int lentemp = len;  // phrase itself length (upper cap for candidates)
            int maxsubphridx = -1;
            unsigned int maxsubphrlen = 0;
            while(1)
            {
                for (int j = 0; j < (int)pout->spbtermcount; j++)
                {
                    //u_int spos = GET_TERM_POS( pout->spbtermpos[j] );
                    u_int slen = GET_TERM_LEN( pout->spbtermpos[j] );
                    // skip subphrases not aligned at the current basic offset
                    if (pout->spbtermoffsets[j] != boff)
                    {
                        continue;
                    }
                    // skip the phrase itself or anything not under the current cap
                    if (slen >= len || slen >= lentemp)
                    {
                        continue;
                    }
                    if (maxsubphrlen < slen)
                    {
                        maxsubphridx = j;
                        maxsubphrlen = slen;
                    }
                }
                if (maxsubphridx >= 0)
                {
                    u_int spos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
                    u_int slen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
                    char sterm[256] = "\0";
                    if(slen >= 256)
                    {
                        WARNING_LOG("error: term length longer than 256!");
                        return ret;
                    }
                    strncpy( sterm, pout->subphrbuf+spos, slen );
                    sterm[slen] = '\0';
                    // Reject candidates whose own sign is splittable.
                    creat_sign_f64 (sterm, slen, &(snode.sign1), &(snode.sign2));
                    ds_ret = ds_op1 (phrasedict, &snode, SEEK);
                    if ( ds_ret == 1)
                    {
                        lentemp = slen;  // tighten the cap and search again
                        //j = -1;
                        maxsubphridx = -1;
                        maxsubphrlen = 0;
                        continue;
                    }
                    else if(ds_ret != 0)
                    {
                        WARNING_LOG("error: ds_op1 result error!");
                        return ret;
                    }
                    else
                    {
                        break;  // candidate accepted
                    }
                }
                break;  // no candidate at all
            }
            // Copy the accepted max subphrase ...
            if (maxsubphridx >= 0)
            {
                if(ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                int mpos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
                int mlen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
                int moff = pout->spbtermoffsets[maxsubphridx];
                strncpy( subtokens[ret].buffer, pout->subphrbuf+mpos, mlen );
                subtokens[ret].buffer[mlen] = '\0';
                subtokens[ret].offset = moff;
                subtokens[ret].length = mlen;
                subtokens[ret].prop.m_hprop = pout->spbtermprop[maxsubphridx].m_hprop;
                subtokens[ret].prop.m_lprop = pout->spbtermprop[maxsubphridx].m_lprop;
                subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                subtokens[ret].index = 0;
                ret += 1;
                rlen -= mlen;
                int sbcnt = 1;
                mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );  // count basic terms covered
                while (mlen > 0)
                {
                    sbcnt += 1;
                    moff += 1;
                    mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
                }
                boff += sbcnt;
            }
            // ... or fall back to copying a single basic term.
            else
            {
                if(ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
                int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );
                strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
                subtokens[ret].buffer[blen] = '\0';
                subtokens[ret].offset = boff;
                subtokens[ret].length = blen;
                subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
                subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
                subtokens[ret].weight = 0;  // pout has no weight/index fields; default to 0
                subtokens[ret].index = 0;
                ret += 1;
                rlen -= blen;
                boff += 1;
            }
        }
    }
    return ret;
}
// Dynamically split a phrase token into finer semantic units — granularity
// between the mixed result and basic terms. This is the variant built on the
// newer seg_tokenize interface (it pulls subphrase/basic tokens via
// seg_tokenize instead of reading scw_out_t buffers directly).
//
// phrasedict : dictionary of phrase signatures that are allowed to be split
// handle     : tokenizer handle passed through to seg_tokenize
// token      : the phrase token to split (passed by value)
// subtokens  : caller-provided output array receiving the split tokens
// tcnt       : capacity of subtokens
// Returns the number of tokens written; on error it logs and returns the
// partial result produced so far.
int seg_split_tokenize( Sdict_search* phrasedict, handle_t handle, token_t token, token_t subtokens[], int tcnt)
{
    int ret = 0;
    Sdict_snode snode;
    token_t subphrase[256];  // fixed local capacity — the caller cannot tune this
    token_t basic[256];
    int ds_ret = 0;
    // Tokens of length <= 2 (a single 2-byte character, presumably — confirm
    // encoding) are never split: pass them through unchanged.
    if (token.length <= 2)
    {
        memcpy( subtokens, &token, sizeof( token_t ) );
        ret = 1;
        return ret;
    }
    //book name combine
    // char *p = token.buffer;
    // if (strncmp( p, "《", 2 ) == 0 && strncmp( p+token.length-2, "》", 2 ) == 0)
    int subcnt = seg_tokenize(handle, token, TOK_SUBPHR,subphrase,256);
    int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 256);
    if(IS_BOOKNAME(token.prop) && IS_BOOKNAME_START(basic[0].prop) && IS_BOOKNAME_END(basic[bsccnt - 1].prop))
    {
        // int subcnt = seg_tokenize(handle, token, TOK_SUBPHR,subphrase,256);
        // int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 256);
        // Exactly 3 basics => bracket + title + bracket: emit them directly.
        if (bsccnt == 3)
        {
            if (ret + 3 > tcnt)
            {
                WARNING_LOG("error : term result number more than max term count!");
                return ret;
            }
            memcpy( subtokens+ret, basic, sizeof( token_t ) );
            ret += 1;
            memcpy( subtokens+ret, basic+1, sizeof( token_t ) );
            ret += 1;
            memcpy( subtokens+ret, basic+2, sizeof( token_t ) );
            ret += 1;
            return ret;
        }
        if (ret + 1 > tcnt)
        {
            WARNING_LOG("error : term result number more than max term count!");
            return ret;
        }
        // Emit the opening-bracket basic token.
        memcpy( subtokens+ret, basic, sizeof( token_t ) );
        ret += 1;
        //build complete seg from subphrase buffer
        // NOTE(review): length-4 / offset 2 assume 2-byte brackets — confirm encoding.
        unsigned int len = token.length-4;
        unsigned int off = 2;
        while (len > 0)
        {
            u_int splitlen = token.length;  // max length a split candidate may have
            u_int tmplen = token.length;
            int maxsubphridx = -1;
            unsigned int maxsubphrlen = 0;
            // Find the longest subphrase aligned at `off` whose own sign is NOT
            // in phrasedict; a dict hit tightens splitlen and retries.
            while(1)
            {
                for (int i = 0; i < subcnt; i++)
                {
                    // skip subphrases not aligned at the current offset
                    if (subphrase[i].offset != off)
                        continue;
                    // skip the phrase itself or anything not under the current cap
                    if (subphrase[i].length >= tmplen || subphrase[i].length >= splitlen)
                        continue;
                    if (maxsubphrlen < subphrase[i].length)
                    {
                        maxsubphridx = i;
                        maxsubphrlen = subphrase[i].length;
                    }
                }
                if(maxsubphridx >= 0)
                {
                    // Reject candidates whose own sign is splittable.
                    creat_sign_f64 (subphrase[maxsubphridx].buffer, subphrase[maxsubphridx].length, &(snode.sign1), &(snode.sign2));
                    ds_ret = ds_op1 (phrasedict, &snode, SEEK);
                    if (ds_ret == 1)
                    {
                        splitlen = maxsubphrlen;  // tighten the cap, search again
                        maxsubphridx = -1;
                        maxsubphrlen = 0;
                        continue;
                    }
                    else if(ds_ret != 0)
                    {
                        WARNING_LOG("error: ds_op1 result error!");
                        return ret;
                    }
                    else
                    {
                        break;  // candidate accepted
                    }
                }
                break;  // no candidate at all
            }
            // Copy the accepted max subphrase ...
            if (maxsubphridx >= 0)
            {
                if (ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                memcpy( subtokens+ret, subphrase+maxsubphridx, sizeof( token_t ) );
                ret += 1;
                len -= subphrase[maxsubphridx].length;
                off += subphrase[maxsubphridx].length;
            }
            // ... or fall back to the basic token at this offset.
            else
            {
                if (ret + 1 > tcnt)
                {
                    WARNING_LOG("error : term result number more than max term count!");
                    return ret;
                }
                int boff = 0;
                while (boff < bsccnt && basic[boff].offset != off)
                {
                    boff += 1;
                }
                if((boff >= bsccnt) || (basic[boff].offset != off))
                {
                    WARNING_LOG("basic offset exceeds basic count");
                    return ret;
                }
                //assert (boff < bsccnt );
                //assert (basic[boff].offset == off);
                memcpy( subtokens+ret, basic+boff, sizeof( token_t ) );
                ret += 1;
                len -= basic[boff].length;
                off += basic[boff].length;
            }
        }
        if (ret + 1 > tcnt)
        {
            WARNING_LOG("error : term result number more than max term count!");
            return ret;
        }
        // Emit the closing-bracket basic token.
        memcpy( subtokens+ret, basic+bsccnt-1, sizeof( token_t ) );
        ret += 1;
        return ret;
    }
    // Non-bookname token: if its sign is NOT in the splittable-phrase dict,
    // pass it through unchanged.
    creat_sign_f64 (token.buffer, token.length, &(snode.sign1), &(snode.sign2));
    if (ds_op1 (phrasedict, &snode, SEEK) != 1)
    {
        memcpy( subtokens, &token, sizeof( token_t ) );
        ret = 1;
        return ret;
    }
    // int subcnt = seg_tokenize(handle, token,TOK_SUBPHR,subphrase,32);
    // int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 32);
    if (bsccnt == 1)//unexpected: a splittable phrase with one basic => phrasedict error
    {
        memcpy( subtokens, &token, sizeof( token_t ) );
        ret = 1;
        return ret;
    }
    // Split the phrase: greedily cover it with the longest acceptable
    // subphrases, falling back to basic tokens (same search as above).
    unsigned int len = token.length;
    unsigned int off = 0;
    while (len > 0)
    {
        u_int splitlen = token.length;
        u_int tmplen = token.length;
        int maxsubphridx = -1;
        unsigned int maxsubphrlen = 0;
        while(1)
        {
            for (int i = 0; i < subcnt; i++)
            {
                // skip subphrases not aligned at the current offset
                if (subphrase[i].offset != off)
                    continue;
                // skip the phrase itself or anything not under the current cap
                if (subphrase[i].length >= tmplen || subphrase[i].length >= splitlen)
                    continue;
                if (maxsubphrlen < subphrase[i].length)
                {
                    maxsubphridx = i;
                    maxsubphrlen = subphrase[i].length;
                }
            }
            if(maxsubphridx >= 0)
            {
                // Reject candidates whose own sign is splittable.
                creat_sign_f64 (subphrase[maxsubphridx].buffer, subphrase[maxsubphridx].length, &(snode.sign1), &(snode.sign2));
                ds_ret = ds_op1 (phrasedict, &snode, SEEK);
                if ( ds_ret == 1)
                {
                    splitlen = maxsubphrlen;  // tighten the cap, search again
                    maxsubphridx = -1;
                    maxsubphrlen = 0;
                    continue;
                }
                else if(ds_ret != 0)
                {
                    WARNING_LOG("error: ds_op1 result error!");
                    return ret;
                }
                else
                {
                    break;  // candidate accepted
                }
            }
            break;  // no candidate at all
        }
        // Copy the accepted max subphrase ...
        if (maxsubphridx >= 0)
        {
            if (ret + 1 > tcnt)
            {
                WARNING_LOG("error : term result number more than max term count!");
                return ret;
            }
            memcpy( subtokens+ret, subphrase+maxsubphridx, sizeof( token_t ) );
            ret += 1;
            len -= subphrase[maxsubphridx].length;
            off += subphrase[maxsubphridx].length;
        }
        // ... or fall back to the basic token at this offset.
        else
        {
            if (ret + 1 > tcnt)
            {
                WARNING_LOG("error : term result number more than max term count!");
                return ret;
            }
            int boff = 0;
            while (boff < bsccnt && basic[boff].offset != off)
            {
                boff += 1;
            }
            if((boff >= bsccnt) || (basic[boff].offset != off))
            {
                WARNING_LOG("basic offset exceeds basic count");
                return ret;
            }
            //assert (boff < bsccnt );
            //assert (basic[boff].offset == off);
            memcpy( subtokens+ret, basic+boff, sizeof( token_t ) );
            ret += 1;
            len -= basic[boff].length;
            off += basic[boff].length;
        }
    }
    return ret;
}