Ejemplo n.º 1
0
// Look up `url` in the word dictionary; true when its signature is present.
bool Dict::query(char* url)
{
    Sdict_snode snode;
    unsigned sign1 = 0;
    unsigned sign2 = 0;

    // Hash the URL into a 64-bit signature (two 32-bit halves).
    creat_sign_f64(url, strlen(url), &sign1, &sign2);
    snode.sign1 = sign1;
    snode.sign2 = sign2;
    snode.code = -1;

    // SEEK overwrites snode.code when the key exists; -1 means "not found".
    db_op1((Sdict_build*)_wordDict, &snode, SEEK);
    return snode.code != -1;
}
Ejemplo n.º 2
0
// Insert `url` into the word dictionary. Returns 0 on success, -1 on failure.
int Dict::add(char* url)
{
    Sdict_snode snode;
    unsigned sign1 = 0;
    unsigned sign2 = 0;

    // Hash the URL into a 64-bit signature (two 32-bit halves).
    creat_sign_f64(url, strlen(url), &sign1, &sign2);
    snode.sign1 = sign1;
    snode.sign2 = sign2;
    snode.code = -1;
    snode.other = 0;

    const int rc = db_op1((Sdict_build*)_wordDict, &snode, ADD);
    return (rc == -1) ? -1 : 0;
}
Ejemplo n.º 3
0
// Driver: reads lines from stdin and prints the 64-bit signature produced by
// the hashing function selected by `mode` (argv[1]); `output` (argv[2])
// enables/disables printing.
//
// Returns -1 on bad usage, 0 otherwise.
int main(int argc,char** argv)
{
    char buf[1024];
    int len;
    unsigned int sign[2];
    unsigned int mode;
    int output;
    // Initialized so modes 8/9 never print indeterminate values if
    // create_url_sign[2] leaves them untouched on failure.
    unsigned long long v1=0,v2=0;
    if(argc!= 3){
        fprintf(stdout,"Usage: %s mode output\n",argv[0]);
        return -1;
    }
    output=atoi(argv[2]);
    mode=atoi(argv[1]);
    while(fgets(buf,sizeof(buf),stdin)!=NULL){
        len=strlen(buf);
        // Strip trailing newline / carriage-return characters.
        // The `len > 0` guard fixes an out-of-bounds read: an input line of
        // just "\n" (or "\r\n") would otherwise drive len to 0 and then
        // inspect buf[-1].
        while(len > 0 && ((buf[len-1]=='\n')||(buf[len-1]=='\r'))){
            buf[len-1]=0;
            len--;
        }
        switch(mode){
            case 1:
                creat_sign_f64(buf,len,sign,sign+1);
                break;
            case 2:
                creat_sign_fs64(buf,len,sign,sign+1);
                break;
            case 3:
                creat_sign_7_host(buf,len,sign,sign+1);
                break;
            case 4:
                creat_sign_md64(buf,len,sign,sign+1);
                break;
            case 5:
                creat_sign_mds64(buf,len,sign,sign+1);
                break;
            case 6:
                creat_sign_murmur64(buf,len,sign,sign+1);
                break;
            case 7:
                creat_sign_murmurs64(buf,len,sign,sign+1);
                break;
            case 8:{
                unsigned int site_sign[2];
                int ret1=0,ret2=0;
                ret1=create_url_sign(buf,site_sign,sign);
                ret2=create_url_sign(buf,v1,v2);
                if(output){
                    fprintf(stdout,"ret1:%d ret2:%d\n",ret1,ret2);
                    fprintf(stdout,"v1:%llu,v2:%llu\n",v1,v2);
                    fprintf(stdout,"site:[0x%x:0x%x]\n",site_sign[0],site_sign[1]);
                }
            }
                break;
            case 9:{
                unsigned int site_sign[2];
                int ret1=0,ret2=0;
                ret1=create_url_sign2(buf,site_sign,sign);
                ret2=create_url_sign2(buf,v1,v2);
                if(output){
                    fprintf(stdout,"ret1:%d ret2:%d\n",ret1,ret2);
                    fprintf(stdout,"v1:%llu,v2:%llu\n",v1,v2);
                    fprintf(stdout,"site:[0x%x:0x%x]\n",site_sign[0],site_sign[1]);
                }
            }
                break;
            default:
                sign[0]=0;
                sign[1]=0;
                break;
        }
        if(output){
            //fprintf(stdout,"%s [%u:%u]\n", buf,sign[0],sign[1]);
            fprintf(stdout,"[0x%x:0x%x]\n",sign[0],sign[1]);
        }
    }
    return 0;
}
Ejemplo n.º 4
0
// Dynamically split phrases from the mixed segmentation result into a more
// reasonable semantic granularity — between the mixed-seg level and the
// basic-word level.
//
// phrasedict : dictionary of phrases that are allowed to split further
// pout       : scw segmentation output (mixed / basic / sub-phrase term arrays)
// subtokens  : output array receiving the split tokens
// tcnt       : capacity of subtokens
// Returns the number of tokens written to subtokens. Errors (null input,
// over-long term, capacity exhausted, dict failure) are logged and the
// partial count written so far is returned — no error code is signalled.
int seg_split( Sdict_search* phrasedict, scw_out_t* pout, token_t subtokens[], int tcnt )
{
	int ret = 0;
	Sdict_snode snode;
	int ds_ret = 0;

	if (pout == NULL  || phrasedict == NULL)
	{
		WARNING_LOG("error: pout or phrasedict is null!");
		return ret;
	}

	// Walk every term of the mixed segmentation result.
	for (u_int i = 0; i < pout->wpbtermcount; i++)
	{
		u_int pos = GET_TERM_POS( pout->wpbtermpos[i] );
		u_int len = GET_TERM_LEN( pout->wpbtermpos[i] );
		u_int off = pout->wpbtermoffsets[i];

		char term[256] = "\0";
		if(len >= 256)
		{
			WARNING_LOG("error: term length longer than 256!");
			return ret;
		}
		strncpy( term, pout->wpcompbuf+pos, len );
		term[len] = '\0';

		//book name combine
		//        char *p = pout->wpcompbuf+pos;
		//       if (strncmp( p, "《", 2 ) == 0 &&  strncmp( p+len-2, "》", 2 ) == 0)
		if(IS_BOOKNAME(pout->wpbtermprop[i]))
		{
			u_int bsccnt = 0;
			u_int boff = off;
			u_int rlen = len;

			// Count how many basic tokens compose this book-name term.
			// NOTE(review): relies on the basic term lengths summing exactly
			// to `len`; rlen is unsigned, so a mismatch would wrap and
			// over-run — assumed guaranteed by the segmenter.
			while (rlen > 0)
			{
				bsccnt += 1;// number of basic tokens
				rlen -= GET_TERM_LEN( pout->wsbtermpos[boff] );
				boff += 1;
			}

			if(IS_BOOKNAME_START(pout->wsbtermprop[off]) && IS_BOOKNAME_END(pout->wsbtermprop[off + bsccnt - 1]))
			{
				// Exactly《 + word + 》: emit the three basic tokens as-is.
				if (bsccnt == 3)
				{
					if (ret + 3 > tcnt)
					{
						WARNING_LOG("error : term result number more than max term count!");
						return ret;
					}

					boff = off;
					while (boff < off+3)
					{
						u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
						u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );

						strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
						subtokens[ret].buffer[blen] = '\0';
						subtokens[ret].offset = boff;
						subtokens[ret].length = blen;
						subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
						subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
						subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
						subtokens[ret].index = 0;

						boff += 1;
						ret += 1;
					}
					continue;
				}

				if(ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}
				// Emit the opening bracket token 《.
				boff = off;
				u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
				u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );

				strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
				subtokens[ret].buffer[blen] = '\0';
				subtokens[ret].offset = boff;
				subtokens[ret].length = blen;
				subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
				subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
				subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
				subtokens[ret].index = 0;

				ret += 1;

				//build complete seg from subphrase buffer
				// Interior = whole term minus the two 2-byte brackets
				// (presumably GBK-encoded 《/》 — TODO confirm encoding).
				rlen = len - 4;
				boff = off + 1;
				while (rlen > 0)
				{
					u_int lentemp = len;
					int maxsubphridx = -1;
					unsigned int maxsubphrlen = 0;
					// Find the longest sub-phrase aligned at boff that is NOT
					// itself splittable per phrasedict; shrink the length
					// bound and retry whenever the candidate is splittable.
					while(1)
					{
						for (int j = 0; j < (int)pout->spbtermcount; j++)
						{
							//u_int spos = GET_TERM_POS( pout->spbtermpos[j] );
							u_int slen = GET_TERM_LEN( pout->spbtermpos[j] );

							//pass not aligned subphrase
							if (pout->spbtermoffsets[j] != boff)
							{
								continue;
							}

							//pass itself,or longer than it
							if (slen >= len || slen >= lentemp)
							{
								continue;
							}
							if (maxsubphrlen < slen)
							{
								maxsubphridx = j;
								maxsubphrlen = slen;
							}
						}
						if (maxsubphridx >= 0)
						{
							u_int spos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
							u_int slen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );

							char sterm[256] = "\0";
							if(slen >= 256)
							{
								WARNING_LOG("error: term length longer than 256!");
								return ret;
							}
							strncpy( sterm, pout->subphrbuf+spos, slen );
							sterm[slen] = '\0';

							//pass subphrase can split
							creat_sign_f64 (sterm, slen, &(snode.sign1), &(snode.sign2));
							ds_ret = ds_op1 (phrasedict, &snode, SEEK);
							if ( ds_ret == 1)
							{
								// Candidate is itself splittable: tighten the
								// bound and rescan for a shorter candidate.
								lentemp = slen;
								//   j = -1;
								maxsubphridx = -1;
								maxsubphrlen = 0;
								continue;
							}
							else if(ds_ret != 0)
							{
								WARNING_LOG("error: ds_op1 result error!");
								return ret;
							}
							else
							{
								break;
							}
						}
						break;
					}

					//copy max subphrase
					if (maxsubphridx >= 0)
					{
						if(ret + 1 > tcnt)
						{
							WARNING_LOG("error : term result number more than max term count!");
							return ret;
						}
						u_int mpos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
						u_int mlen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
						u_int moff = pout->spbtermoffsets[maxsubphridx];

						strncpy( subtokens[ret].buffer, pout->subphrbuf+mpos, mlen );
						subtokens[ret].buffer[mlen] = '\0';
						subtokens[ret].offset = moff;
						subtokens[ret].length = mlen;
						subtokens[ret].prop.m_hprop = pout->spbtermprop[maxsubphridx].m_hprop;
						subtokens[ret].prop.m_lprop = pout->spbtermprop[maxsubphridx].m_lprop;
						subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
						subtokens[ret].index = 0;

						ret += 1;
						rlen -= mlen;
						// Advance boff by the number of basic tokens the
						// emitted sub-phrase spans.
						int bcnt = 1;
						mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
						while (mlen > 0)
						{
							bcnt += 1;
							moff += 1;
							mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
						}

						boff += bcnt;
					}
					//copy basic
					else
					{
						if(ret + 1 > tcnt)
						{
							WARNING_LOG("error : term result number more than max term count!");
							return ret;
						}
						u_int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
						u_int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );

						strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
						subtokens[ret].buffer[blen] = '\0';
						subtokens[ret].offset = boff;
						subtokens[ret].length = blen;
						subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
						subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
						subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
						subtokens[ret].index = 0;

						ret += 1;
						rlen -= blen;
						boff += 1;
					}
				}

				if(ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}
				// Emit the closing bracket token 》 (boff now points at it).
				bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
				blen = GET_TERM_LEN( pout->wsbtermpos[boff] );

				strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
				subtokens[ret].buffer[blen] = '\0';
				subtokens[ret].offset = boff;
				subtokens[ret].length = blen;
				subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
				subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
				subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
				subtokens[ret].index = 0;

				ret += 1;
				continue;
			}
		}

		// Non-book-name path: if the term itself is not a splittable phrase,
		// copy it through unchanged.
		creat_sign_f64 (term, len, &(snode.sign1), &(snode.sign2));
		if (ds_op1 (phrasedict, &snode, SEEK) != 1)
		{
			if(ret + 1 > tcnt)
			{
				WARNING_LOG("error : term result number more than max term count!");
				return ret;
			}

			strncpy( subtokens[ret].buffer, term, len );
			subtokens[ret].buffer[len] = '\0';
			subtokens[ret].offset = off;
			subtokens[ret].length = len;
			subtokens[ret].prop.m_hprop = pout->wpbtermprop[i].m_hprop;
			subtokens[ret].prop.m_lprop = pout->wpbtermprop[i].m_lprop;
			subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
			subtokens[ret].index = 0;

			ret += 1;
			continue;
		}

		// Splittable phrase: greedily cover it with the longest
		// non-splittable sub-phrases, falling back to basic tokens.
		u_int rlen = len;
		u_int boff = off;
		while (rlen > 0)
		{
			u_int lentemp = len;//phrase itself length
			int maxsubphridx = -1;
			unsigned int maxsubphrlen = 0;
			while(1)
			{
				for (int j = 0; j < (int)pout->spbtermcount; j++)
				{
					//u_int spos = GET_TERM_POS( pout->spbtermpos[j] );
					u_int slen = GET_TERM_LEN( pout->spbtermpos[j] );

					//pass not aligned subphrase
					if (pout->spbtermoffsets[j] != boff)
					{
						continue;
					}

					//pass itself
					if (slen >= len || slen >= lentemp)
					{
						continue;
					}
					if (maxsubphrlen < slen)
					{
						maxsubphridx = j;
						maxsubphrlen = slen;
					}
				}
				if (maxsubphridx >= 0)
				{
					u_int spos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
					u_int slen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );

					char sterm[256] = "\0";
					if(slen >= 256)
					{
						WARNING_LOG("error: term length longer than 256!");
						return ret;
					}
					strncpy( sterm, pout->subphrbuf+spos, slen );
					sterm[slen] = '\0';

					//pass subphrase can split
					creat_sign_f64 (sterm, slen, &(snode.sign1), &(snode.sign2));
					ds_ret = ds_op1 (phrasedict, &snode, SEEK);
					if ( ds_ret == 1)
					{
						// Candidate splittable: tighten bound and rescan.
						lentemp = slen;
						//j = -1;
						maxsubphridx = -1;
						maxsubphrlen = 0;
						continue;
					}
					else if(ds_ret != 0)
					{
						WARNING_LOG("error: ds_op1 result error!");
						return ret;
					}
					else
					{
						break;
					}
				}
				break;
			}

			//copy max subphrase
			if (maxsubphridx >= 0)
			{
				if(ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}
				int mpos = GET_TERM_POS( pout->spbtermpos[maxsubphridx] );
				int mlen = GET_TERM_LEN( pout->spbtermpos[maxsubphridx] );
				int moff = pout->spbtermoffsets[maxsubphridx];

				strncpy( subtokens[ret].buffer, pout->subphrbuf+mpos, mlen );
				subtokens[ret].buffer[mlen] = '\0';
				subtokens[ret].offset = moff;
				subtokens[ret].length = mlen;
				subtokens[ret].prop.m_hprop = pout->spbtermprop[maxsubphridx].m_hprop;
				subtokens[ret].prop.m_lprop = pout->spbtermprop[maxsubphridx].m_lprop;
				subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
				subtokens[ret].index = 0;
				ret += 1;
				rlen -= mlen;

				int sbcnt = 1;
				mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );//count basic tokens spanned
				while (mlen > 0)
				{
					sbcnt += 1;
					moff += 1;
					mlen -= GET_TERM_LEN( pout->wsbtermpos[moff] );
				}

				boff += sbcnt;
			}
			//copy basic
			else
			{
				if(ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}
				int bpos = GET_TERM_POS( pout->wsbtermpos[boff] );
				int blen = GET_TERM_LEN( pout->wsbtermpos[boff] );

				strncpy( subtokens[ret].buffer, pout->wordsepbuf+bpos, blen );
				subtokens[ret].buffer[blen] = '\0';
				subtokens[ret].offset = boff;
				subtokens[ret].length = blen;
				subtokens[ret].prop.m_hprop = pout->wsbtermprop[boff].m_hprop;
				subtokens[ret].prop.m_lprop = pout->wsbtermprop[boff].m_lprop;
				subtokens[ret].weight = 0;//pout has no weight/index fields, initialize to 0
				subtokens[ret].index = 0;

				ret += 1;
				rlen -= blen;
				boff += 1;
			}
		}
	}

	return ret;
}
Ejemplo n.º 5
0
//
// Dynamically split a phrase from the mixed segmentation result into a more
// reasonable semantic granularity — between the mixed-seg level and the
// basic-word level.
// Caller-facing wrapper built on the new seg_tokenize interface.
//
// phrasedict : dictionary of phrases that are allowed to split further
// handle     : segmenter handle passed through to seg_tokenize
// token      : the phrase token to split
// subtokens  : output array receiving the split tokens
// tcnt       : capacity of subtokens
// Returns the number of tokens written. Errors are logged and the partial
// count written so far is returned — no error code is signalled.
int seg_split_tokenize( Sdict_search* phrasedict, handle_t handle, token_t token, token_t subtokens[], int tcnt)
{
	int ret = 0;
	Sdict_snode snode;
	token_t subphrase[256];// NOTE(review): hard-coded capacity — the caller cannot tune this
	token_t basic[256];
	int ds_ret = 0;

	// Tokens of length <= 2 (presumably one multi-byte character — TODO
	// confirm encoding) cannot be split; pass through unchanged.
	if (token.length <= 2)
	{
		memcpy( subtokens, &token, sizeof( token_t ) );
		ret = 1;
		return ret;
	}

	//book name combine
	//    char *p = token.buffer;
	//   if (strncmp( p, "《", 2 ) == 0 && strncmp( p+token.length-2, "》", 2 ) == 0)
	int subcnt = seg_tokenize(handle, token, TOK_SUBPHR,subphrase,256);
	int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 256);

	if(IS_BOOKNAME(token.prop) && IS_BOOKNAME_START(basic[0].prop) && IS_BOOKNAME_END(basic[bsccnt - 1].prop))
	{
		//     int subcnt = seg_tokenize(handle, token, TOK_SUBPHR,subphrase,256);
		//     int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 256);

		// Exactly《 + word + 》: emit the three basic tokens as-is.
		if (bsccnt == 3)
		{
			if (ret + 3 > tcnt)
			{
				WARNING_LOG("error : term result number more than max term count!");
				return ret;
			}
			memcpy( subtokens+ret, basic, sizeof( token_t ) );
			ret += 1;
			memcpy( subtokens+ret, basic+1, sizeof( token_t ) );
			ret += 1;
			memcpy( subtokens+ret, basic+2, sizeof( token_t ) );
			ret += 1;

			return ret;
		}

		if (ret + 1 > tcnt)
		{
			WARNING_LOG("error : term result number more than max term count!");
			return ret;
		}
		// Emit the opening bracket token 《.
		memcpy( subtokens+ret, basic, sizeof( token_t ) );
		ret += 1;

		//build complete seg from subphrase buffer
		// Interior = whole token minus the two 2-byte brackets.
		unsigned int len = token.length-4;
		unsigned int off = 2;
		while (len > 0)
		{
			u_int splitlen = token.length;// length bound of the largest splittable phrase found so far
			u_int tmplen = token.length;
			int maxsubphridx = -1;
			unsigned int maxsubphrlen = 0;
			// Find the longest sub-phrase aligned at `off` that is NOT itself
			// splittable per phrasedict; shrink the bound and retry when the
			// candidate is splittable.
			while(1)
			{
				for (int i = 0; i < subcnt; i++)
				{
					//pass not aligned subphrase
					if (subphrase[i].offset != off)
						continue;

					//pass itself
					if (subphrase[i].length >= tmplen || subphrase[i].length >= splitlen)
						continue;

					if (maxsubphrlen < subphrase[i].length)
					{
						maxsubphridx = i;
						maxsubphrlen = subphrase[i].length;
					}
				}
				if(maxsubphridx >= 0)
				{
					//pass subphrase can split
					creat_sign_f64 (subphrase[maxsubphridx].buffer, subphrase[maxsubphridx].length, &(snode.sign1), &(snode.sign2));
					ds_ret = ds_op1 (phrasedict, &snode, SEEK);
					if (ds_ret == 1)
					{
						splitlen = maxsubphrlen;
						maxsubphridx = -1;
						maxsubphrlen = 0;
						continue;
					}
					else if(ds_ret != 0)
					{
						WARNING_LOG("error: ds_op1 result error!");
						return ret;
					}
					else
					{
						break;
					}
				}
				break;
			}
			//copy max subphrase
			if (maxsubphridx >= 0)
			{
				if (ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}
				memcpy( subtokens+ret, subphrase+maxsubphridx, sizeof( token_t ) );
				ret += 1;
				len -= subphrase[maxsubphridx].length;
				off += subphrase[maxsubphridx].length;
			}
			//copy basic
			else
			{
				if (ret + 1 > tcnt)
				{
					WARNING_LOG("error : term result number more than max term count!");
					return ret;
				}

				// Locate the basic token aligned at `off`.
				int boff = 0;
				while (boff < bsccnt && basic[boff].offset != off)
				{
					boff += 1;
				}

				if((boff >= bsccnt) || (basic[boff].offset != off))
				{
					WARNING_LOG("basic offset exceeds basic count");
					return ret;
				}
				//assert (boff < bsccnt );
				//assert (basic[boff].offset == off);

				memcpy( subtokens+ret, basic+boff, sizeof( token_t ) );

				ret += 1;
				len -= basic[boff].length;
				off += basic[boff].length;
			}
		}
		if (ret + 1 > tcnt)
		{
			WARNING_LOG("error : term result number more than max term count!");
			return ret;
		}
		// Emit the closing bracket token 》.
		memcpy( subtokens+ret, basic+bsccnt-1, sizeof( token_t ) );
		ret += 1;

		return ret;
	}

	// Non-book-name path: if the token is not a splittable phrase, pass it
	// through unchanged.
	creat_sign_f64 (token.buffer, token.length, &(snode.sign1), &(snode.sign2));
	if (ds_op1 (phrasedict, &snode, SEEK) != 1)
	{
		memcpy( subtokens, &token, sizeof( token_t ) );
		ret = 1;
		return ret;
	}

	//  int subcnt = seg_tokenize(handle, token,TOK_SUBPHR,subphrase,32);
	//  int bsccnt = seg_tokenize(handle, token, TOK_BASIC, basic, 32);

	if (bsccnt == 1)//unexpected: phrase dict says splittable but only one basic token — dict error
	{
		memcpy( subtokens, &token, sizeof( token_t ) );
		ret = 1;
		return ret;
	}

	//split phrase
	// Greedily cover the token with the longest non-splittable sub-phrases,
	// falling back to basic tokens.
	unsigned int len = token.length;
	unsigned int off = 0;
	while (len > 0)
	{
		u_int splitlen = token.length;
		u_int tmplen = token.length;
		int maxsubphridx = -1;
		unsigned int maxsubphrlen = 0;
		while(1)
		{
			for (int i = 0; i < subcnt; i++)
			{
				//pass not aligned subphrase
				if (subphrase[i].offset != off)
					continue;

				//pass itself
				if (subphrase[i].length >= tmplen || subphrase[i].length >= splitlen)
					continue;

				if (maxsubphrlen < subphrase[i].length)
				{
					maxsubphridx = i;
					maxsubphrlen = subphrase[i].length;
				}
			}
			if(maxsubphridx >= 0)
			{
				//pass subphrase can split
				creat_sign_f64 (subphrase[maxsubphridx].buffer, subphrase[maxsubphridx].length, &(snode.sign1), &(snode.sign2));
				ds_ret = ds_op1 (phrasedict, &snode, SEEK);
				if ( ds_ret == 1)
				{
					splitlen = maxsubphrlen;
					maxsubphridx = -1;
					maxsubphrlen = 0;
					continue;
				}
				else if(ds_ret != 0)
				{
					WARNING_LOG("error: ds_op1 result error!");
					return ret;
				}
				else
				{
					break;
				}
			}
			break;
		}

		//copy max subphrase
		if (maxsubphridx >= 0)
		{
			if (ret + 1 > tcnt)
			{
				WARNING_LOG("error : term result number more than max term count!");
				return ret;
			}
			memcpy( subtokens+ret, subphrase+maxsubphridx, sizeof( token_t ) );
			ret += 1;
			len -= subphrase[maxsubphridx].length;
			off += subphrase[maxsubphridx].length;
		}
		//copy basic
		else
		{
			if (ret + 1 > tcnt)
			{
				WARNING_LOG("error : term result number more than max term count!");
				return ret;
			}
			// Locate the basic token aligned at `off`.
			int boff = 0;
			while (boff < bsccnt && basic[boff].offset != off)
			{
				boff += 1;
			}
			if((boff >= bsccnt) || (basic[boff].offset != off))
			{
				WARNING_LOG("basic offset exceeds basic count");
				return ret;
			}
			//assert (boff < bsccnt );
			//assert (basic[boff].offset == off);

			memcpy( subtokens+ret, basic+boff, sizeof( token_t ) );
			ret += 1;

			len -= basic[boff].length;
			off += basic[boff].length;
		}
	}

	return ret;
}