void MainWindow::textBagOfWords(std::set<std::string> &featureDic, std::map<std::string, int> &text,int &textSize) { if(!ICTCLAS_Init()) //初始化分词组件。 { QMessageBox::warning(this,"Warnning","Init fails",QMessageBox::Yes); return; } else { printf("Init ok\n"); } ICTCLAS_SetPOSmap(2); QString myStr = ui->text->toPlainText(); QByteArray ba = myStr.toLocal8Bit(); char* sText; char* sSentence = (char*)malloc(ba.size()+10); sText = ba.data(); int len = 0; //这样读入为了将换行符去掉,ASCII码10、13 for(int i = 0; sText[i] != '\0'; i++) { if(sText[i] == '\n' || sText[i] == '\r')continue; sSentence[len++] = sText[i]; } sSentence[len] = '\0'; unsigned int nPaLen=strlen(sSentence); // 需要分词的长度 char* sRst=0; //用户自行分配空间,用于保存结果; sRst=(char*)malloc(nPaLen*6); //建议长度为字符串长度的6倍。 int nRstLen=0; //分词结果的长度 nRstLen = ICTCLAS_ParagraphProcess(sSentence,nPaLen,sRst,CODE_TYPE_UNKNOWN,0); //字符串处理 //free(sText); free(sSentence); //收集单词,形成字典 std::string words; std::istringstream istream(sRst); std::set<std::string> txtWords; while(istream>>words) { txtWords.insert(words); if(featureDic.count(words)) { text[words]++; } } textSize = txtWords.size(); free(sRst); txtWords.clear(); istream.clear(); ICTCLAS_Exit(); //释放资源退出 return; }
void SplitUTF8() { //初始化分词组件 if(!ICTCLAS_Init("",UTF8_CODE))//数据在当前路径下,设置为UTF8编码的分词 { printf("ICTCLAS INIT FAILED!\n"); return ; } ICTCLAS_FileProcess("testUTF.txt","testUTF_result.txt"); ICTCLAS_Exit(); }
void SplitBIG5() { //初始化分词组件 if(!ICTCLAS_Init("",BIG5_CODE))//数据在当前路径下,设置为BIG5编码的分词 { printf("ICTCLAS INIT FAILED!\n"); return ; } ICTCLAS_FileProcess("testBIG.txt","testBIG_result.txt"); ICTCLAS_Exit(); }
void FingerPrint(const char *sInput) { //初始化分词组件 if(!ICTCLAS_Init()) { printf("ICTCLAS INIT FAILED!\n"); return ; } //释放分词组件资源 ICTCLAS_Exit(); }
int main(int argc, char* argv[]) { //Sample1: Sentence or paragraph lexical analysis with only one result char sSentence[2000],sSentenceResult[5000]; ICTCLAS_Init(); printf("Input sentence now!\n"); scanf("%s",sSentence); while(_stricmp(sSentence,"q")!=0) { ICTCLAS_ParagraphProcess(sSentence,sSentenceResult); printf("%s\nInput string now!\n",sSentenceResult); scanf("%s",sSentence); } ICTCLAS_Exit(); //Sample2: File segmentation and POS tagging /* ICTCLAS_Init(); ICTCLAS_FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt"); ICTCLAS_Exit(); */ //Sample3: Sentence segmentation and POS tagging with multiple result /* char sSentence[2000],**sSentenceResult; int i; sSentenceResult=new char*[5]; for(i=0;i<5;i++) sSentenceResult[i]=new char[5000]; ICTCLAS_Init(); printf("Input sentence now!\n"); scanf("%s",sSentence); while(_stricmp(sSentence,"q")!=0) { ICTCLAS_SentenceProcess(sSentence,5,sSentenceResult); for(i=0;i<5;i++) printf("Result%d:%s\n",i+1,sSentenceResult[i]); printf("Input string now!\n"); scanf("%s",sSentence); } ICTCLAS_Exit(); for(i=0;i<5;i++) delete[] sSentenceResult[i]; delete [] sSentenceResult; */ return 0; }
void KeyExtract(const char *sInput) { //初始化分词组件 if(!ICTCLAS_Init()) { printf("ICTCLAS INIT FAILED!\n"); return ; } int nCount = ICTCLAS_GetParagraphProcessAWordCount(sInput); //分词。提取关键词 result_t *result =(result_t*)malloc(sizeof(result_t)*nCount); ICTCLAS_ParagraphProcessAW(nCount,result);//获取结果存到客户的内存中 //指纹提取,须在ICTCLAS_ParagraphProcessAW函数执行完后执行 unsigned long lFinger = ICTCLAS_FingerPrint(); char buf[100]; memset(buf, 0, 100); sprintf(buf, "%x", lFinger); printf("%s\n", buf); //关键词提取,须在ICTCLAS_ParagraphProcessAW函数执行完后执行 result_t *resultKey = (result_t*)malloc(sizeof(result_t)*nCount); int nCountKey; ICTCLAS_KeyWord(resultKey, nCountKey); for (int i=0; i<nCountKey; i++) { char buf[100]; memset(buf, 0, 100); int index = resultKey[i].start; if( resultKey[i].weight>0) { memcpy(buf,(void *)(sInput+index), resultKey[i].length); printf("%s\t%d\n", buf, resultKey[i].weight); } } free(resultKey); free(result); //释放分词组件资源 ICTCLAS_Exit(); }
int main(int argc, char** argv) { //testNewWord(GBK_CODE); //testNewWord(UTF8_CODE); //初始化分词组件 char* inputFile = argv[1]; char* outputFile = argv[2]; printf("%s\n", inputFile); printf("%s\n", outputFile); if(!ICTCLAS_Init("..",UTF8_CODE))//数据在当前路径下,设置为UTF8编码的分词 { printf("ICTCLAS INIT FAILED!\n"); return NULL; } ICTCLAS_FileProcess(inputFile, outputFile, 0); ICTCLAS_Exit(); return 0; }
void SplitGBK(const char *sInput) {//分词演示 //初始化分词组件 if(!ICTCLAS_Init())//数据在当前路径下,默认为GBK编码的分词 { printf("ICTCLAS INIT FAILED!\n"); return ; } ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND); char sSentence[2000]="三枪拍案惊奇的主创人员包括孙红雷、小沈阳、闫妮等,导演为张艺谋"; const char * sResult; int nCount; ICTCLAS_ParagraphProcessA(sSentence,&nCount); printf("nCount=%d\n",nCount); ICTCLAS_AddUserWord("孙红雷 yym");//添加孙红雷,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("小沈阳 yym");//添加小沈阳,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("闫妮 yym");//添加闫妮,作为演员名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); ICTCLAS_AddUserWord("三枪拍案惊奇 dym");//添加三枪拍案惊奇,作为电影名称 sResult = ICTCLAS_ParagraphProcess(sSentence,1); printf("%s\n", sResult); while(_stricmp(sSentence,"q")!=0) { sResult = ICTCLAS_ParagraphProcess(sSentence,0); printf("%s\nInput string now('q' to quit)!\n", sResult); scanf("%s",sSentence); } //导入用户词典前 printf("未导入用户词典:\n"); sResult = ICTCLAS_ParagraphProcess(sInput, 0); printf("%s\n", sResult); //导入用户词典后 printf("\n导入用户词典后:\n"); nCount = ICTCLAS_ImportUserDict("userdic.txt");//userdic.txt覆盖以前的用户词典 //保存用户词典 ICTCLAS_SaveTheUsrDic(); printf("导入%d个用户词。\n", nCount); sResult = ICTCLAS_ParagraphProcess(sInput, 1); printf("%s\n", sResult); //动态添加用户词 printf("\n动态添加用户词后:\n"); ICTCLAS_AddUserWord("计算机学院 xueyuan"); ICTCLAS_SaveTheUsrDic(); sResult = ICTCLAS_ParagraphProcess(sInput, 1); printf("%s\n", sResult); //对文件进行分词 ICTCLAS_FileProcess("testGBK.txt","testGBK_result.txt",1); //释放分词组件资源 ICTCLAS_Exit(); }