int segment_words(const char * content,enum wtype type,int len,bool stepword,vector<funit>& features) { if(!content || len <= 0){ UB_LOG_WARNING("segment words,parameter error"); return -1; } /*get word segment result buffer*/ thread_data * tsd = (thread_data *)pthread_getspecific(key); if(!tsd){ UB_LOG_FATAL("thread special data is null"); exit(0); } scw_out_t *pout = tsd->pout; /*word segment*/ if(scw_segment_words(pwdict,pout,content,len,LANGTYPE_SIMP_CHINESE,NULL) == -1){ UB_LOG_WARNING("scw segment words failed"); return -1; } /*get result to vectore features*/ int i,count; token_t tokens[1024]; funit tmp; /*word type,we just need SCW_OUT_WPCOMP*/ u_int tsco[5] = {SCW_OUT_WPCOMP,SCW_OUT_BASIC,SCW_OUT_SUBPH, SCW_OUT_HUMANNAME,SCW_OUT_BOOKNAME}; /*just SCW_OUT_WPCOMP mode,so j < 1*/ for(int j = 0;j < 1;j ++) { count = scw_get_token_1(pout,tsco[j],tokens,1024); for(i = 0;i < count;i ++) { /*filter space and special punc*/ trim_string(tokens[i].buffer); if(strlen(tokens[i].buffer) <= 1) continue; tmp.feature = tokens[i].buffer; tmp.weight = 1; features.push_back(tmp); } } /*get weight*/ feature_weight(features,type); /*output result for debug*/ for(i = 0;i < (int)features.size();i++) { tmp = features.at(i); UB_LOG_DEBUG("word[%s] weight[%f]",tmp.feature.c_str(),tmp.weight); } return 0; }
int main(int argc,char** argv) { scw_worddict_t * pwdict; scw_out_t *pout; char line[1024000]; u_int scw_out_flag; int flag = 0; if(argc!= 3 ) { fprintf(stderr, "usage: %s worddict_dir outtype\n", argv[0]); exit(-1); } if((pwdict=scw_load_worddict(argv[1]))==NULL) { fprintf(stderr,"Load worddict failed.Filename=worddict/bin/"); return 1; } flag = atoi(argv[0]); scw_out_flag = SCW_OUT_ALL | SCW_OUT_PROP; if((pout=scw_create_out(80000, scw_out_flag))==NULL) { fprintf(stderr,"Init the output buffer error.\n"); return -1; } while(fgets(line,sizeof(line),stdin)) { int len=strlen(line); while((line[len-1]=='\r') ||(line[len-1]=='\n')) line[--len]=0; if(scw_segment_words(pwdict,pout,line,len)<0) { fprintf(stderr, "query %s error\n", line); scw_destroy_out(pout); return -1; } scw_dump_out2(pout,flag,pwdict->m_wdtype); } return 0; }