void MainWindow::textBagOfWords(std::set<std::string> &featureDic, std::map<std::string, int> &text,int &textSize) { if(!ICTCLAS_Init()) //初始化分词组件。 { QMessageBox::warning(this,"Warnning","Init fails",QMessageBox::Yes); return; } else { printf("Init ok\n"); } ICTCLAS_SetPOSmap(2); QString myStr = ui->text->toPlainText(); QByteArray ba = myStr.toLocal8Bit(); char* sText; char* sSentence = (char*)malloc(ba.size()+10); sText = ba.data(); int len = 0; //这样读入为了将换行符去掉,ASCII码10、13 for(int i = 0; sText[i] != '\0'; i++) { if(sText[i] == '\n' || sText[i] == '\r')continue; sSentence[len++] = sText[i]; } sSentence[len] = '\0'; unsigned int nPaLen=strlen(sSentence); // 需要分词的长度 char* sRst=0; //用户自行分配空间,用于保存结果; sRst=(char*)malloc(nPaLen*6); //建议长度为字符串长度的6倍。 int nRstLen=0; //分词结果的长度 nRstLen = ICTCLAS_ParagraphProcess(sSentence,nPaLen,sRst,CODE_TYPE_UNKNOWN,0); //字符串处理 //free(sText); free(sSentence); //收集单词,形成字典 std::string words; std::istringstream istream(sRst); std::set<std::string> txtWords; while(istream>>words) { txtWords.insert(words); if(featureDic.count(words)) { text[words]++; } } textSize = txtWords.size(); free(sRst); txtWords.clear(); istream.clear(); ICTCLAS_Exit(); //释放资源退出 return; }
int main(int argc, char **argv) { int c; /* 默认参数设置 */ char *httpcws_settings_listen = "0.0.0.0"; int httpcws_settings_port = 1985; char *httpcws_settings_datapath = NULL; /*中文词典数据库路径 */ bool httpcws_settings_daemon = false; int httpcws_settings_timeout = 120; /* 单位:秒 */ /* process arguments */ while ((c = getopt(argc, argv, "l:p:x:t:dh")) != -1) { switch (c) { case 'l': httpcws_settings_listen = strdup(optarg); break; case 'p': httpcws_settings_port = atoi(optarg); break; case 'x': httpcws_settings_datapath = strdup(optarg); /* 词库文件存储路径 */ break; case 't': httpcws_settings_timeout = atoi(optarg); break; case 'd': httpcws_settings_daemon = true; break; case 'h': default: show_help(); return 1; } } /* 判断是否加了必填参数 -x */ if (httpcws_settings_datapath == NULL) { show_help(); fprintf(stderr, "Attention: Please use the indispensable argument: -x <path>\n\n"); exit(1); } /* 初始化分词组件 */ if(!ICTCLAS_Init(httpcws_settings_datapath)) { printf("%s\n", httpcws_settings_datapath); fprintf(stderr, "ERROR: Count not open the Chinese dictionary!\n\n"); exit(1); } ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND); fprintf(stderr, "Loading Chinese dictionary 'httpcws_dict.txt' into memory, please waitting ......\n"); char *httpcws_settings_dataname = (char *)malloc(1024); memset (httpcws_settings_dataname, '\0', 1024); sprintf(httpcws_settings_dataname, "%s/httpcws_dict.txt", httpcws_settings_datapath); int nCount = ICTCLAS_ImportUserDict(httpcws_settings_dataname); ICTCLAS_SaveTheUsrDic(); free(httpcws_settings_dataname); printf("OK! %d words has loaded into memory.\n\n", nCount); printf("HTTPCWS Server running on %s:%d\n", httpcws_settings_listen, httpcws_settings_port); /* 如果加了-d参数,以守护进程运行 */ if (httpcws_settings_daemon == true){ pid_t pid; /* Fork off the parent process */ pid = fork(); if (pid < 0) { exit(EXIT_FAILURE); } /* If we got a good PID, then we can exit the parent process. */ if (pid > 0) { exit(EXIT_SUCCESS); } } /* 请求处理部分 */ struct evhttp *httpd; event_init(); httpd = evhttp_start(httpcws_settings_listen, httpcws_settings_port); evhttp_set_timeout(httpd, httpcws_settings_timeout); /* Set a callback for all other requests. */ evhttp_set_gencb(httpd, httpcws_handler, NULL); event_dispatch(); /* Not reached in this code as it is now. */ evhttp_free(httpd); return 0; }
void SplitGBK(const char *sInput) {//分词演示 //初始化分词组件 if(!ICTCLAS_Init())//数据在当前路径下,默认为GBK编码的分词 { printf("ICTCLAS INIT FAILED!\n"); return ; } ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND); char sSentence[2000]="三枪拍案惊奇的主创人员包括孙红雷、小沈阳、闫妮等,导演为张艺谋"; const char * sResult; int nCount; ICTCLAS_ParagraphProcessA(sSentence,&nCount); printf("nCount=%d\n",nCount); ICTCLAS_AddUserWord("孙红雷 yym");//添加孙红雷,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("小沈阳 yym");//添加小沈阳,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("闫妮 yym");//添加闫妮,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("三枪拍案惊奇 dym");//添加三枪拍案惊奇,作为电影名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); while(_stricmp(sSentence,"q")!=0) { sResult = ICTCLAS_ParagraphProcess(sSentence,0); printf("%s\nInput string now('q' to quit)!\n", sResult); scanf("%s",sSentence); } //导入用户词典前 printf("未导入用户词典:\n"); sResult = ICTCLAS_ParagraphProcess(sInput, 0); printf("%s\n", sResult); //导入用户词典后 printf("\n导入用户词典后:\n"); nCount = ICTCLAS_ImportUserDict("userdic.txt");//userdic.txt覆盖以前的用户词典 //保存用户词典 ICTCLAS_SaveTheUsrDic(); printf("导入%d个用户词。\n", nCount); sResult = ICTCLAS_ParagraphProcess(sInput, 1); printf("%s\n", sResult); //动态添加用户词 printf("\n动态添加用户词后:\n"); ICTCLAS_AddUserWord("计算机学院 xueyuan"); ICTCLAS_SaveTheUsrDic(); sResult = ICTCLAS_ParagraphProcess(sInput, 1); printf("%s\n", sResult); //对文件进行分词 ICTCLAS_FileProcess("testGBK.txt","testGBK_result.txt",1); //释放分词组件资源 ICTCLAS_Exit(); }