示例#1
0
int segment_words(const char * content,enum wtype type,int len,bool stepword,vector<funit>& features)
{
	if(!content || len <= 0){
		UB_LOG_WARNING("segment words,parameter error");
		return -1;
	}

    /*get word segment result buffer*/
    thread_data * tsd = (thread_data *)pthread_getspecific(key);
    if(!tsd){
        UB_LOG_FATAL("thread special data is null");
        exit(0);
    }
    scw_out_t *pout = tsd->pout;

	/*word segment*/
	if(scw_segment_words(pwdict,pout,content,len,LANGTYPE_SIMP_CHINESE,NULL) == -1){
		UB_LOG_WARNING("scw segment words failed");
		return -1;
	}

	/*get result to vectore features*/
	int i,count;
	token_t tokens[1024];
	funit tmp;
    
    /*word type,we just need SCW_OUT_WPCOMP*/
	u_int tsco[5] = {SCW_OUT_WPCOMP,SCW_OUT_BASIC,SCW_OUT_SUBPH,
		SCW_OUT_HUMANNAME,SCW_OUT_BOOKNAME};
    
	/*just SCW_OUT_WPCOMP mode,so j < 1*/
	for(int j = 0;j < 1;j ++)
	{
		count = scw_get_token_1(pout,tsco[j],tokens,1024);
		for(i = 0;i < count;i ++)
		{
			/*filter space and special punc*/
			trim_string(tokens[i].buffer);
			if(strlen(tokens[i].buffer) <= 1)
				continue;
            
            tmp.feature = tokens[i].buffer;
			tmp.weight = 1;
			features.push_back(tmp);
		}
	}

    /*get weight*/
    feature_weight(features,type);

	/*output result for debug*/
	for(i = 0;i < (int)features.size();i++)
	{
		tmp = features.at(i);
		UB_LOG_DEBUG("word[%s] weight[%f]",tmp.feature.c_str(),tmp.weight);
	}

	return 0;
}
示例#2
0
/*
 * Command-line driver: segment every line read from stdin and dump the
 * result with scw_dump_out2().
 *
 * usage: <program> worddict_dir outtype
 *   argv[1] - directory passed to scw_load_worddict()
 *   argv[2] - integer passed as the flag argument of scw_dump_out2()
 *
 * Exit status: 0 when all input was processed, 1 when the dictionary
 * could not be loaded, -1 on a usage error, when the output buffer
 * cannot be created, or when segmenting a line fails.
 */
int main(int argc,char** argv)
{
  scw_worddict_t * pwdict;
  scw_out_t *pout;
  char line[1024000];
  u_int scw_out_flag;
  int flag = 0;

  if(argc!= 3 )
  {
    fprintf(stderr, "usage: %s worddict_dir outtype\n", argv[0]);
    exit(-1);
  }

  if((pwdict=scw_load_worddict(argv[1]))==NULL)
  {
    /* report the directory that was actually requested */
    fprintf(stderr,"Load worddict failed.Filename=%s\n", argv[1]);
    return 1;
  }

  /* outtype is the second user argument, not the program name */
  flag = atoi(argv[2]);

  scw_out_flag = SCW_OUT_ALL | SCW_OUT_PROP;
  if((pout=scw_create_out(80000, scw_out_flag))==NULL)
  {
    fprintf(stderr,"Init the output buffer error.\n");
    return -1;
  }

  while(fgets(line,sizeof(line),stdin))
  {
    int len=(int)strlen(line);

    /* strip trailing CR/LF; the len > 0 guard keeps a blank line from
       making the loop read line[-1] */
    while(len > 0 && ((line[len-1]=='\r') ||(line[len-1]=='\n')))
      line[--len]=0;

    if(scw_segment_words(pwdict,pout,line,len)<0)
    {
      fprintf(stderr, "query %s error\n", line);
      scw_destroy_out(pout);
      return -1;
    }
    scw_dump_out2(pout,flag,pwdict->m_wdtype);
  }

  /* release the output buffer on the normal path too, as the error path does */
  scw_destroy_out(pout);

  return 0;
}