Exemple #1
0
void MainWindow::textBagOfWords(std::set<std::string> &featureDic, std::map<std::string, int> &text,int &textSize)
{
    if(!ICTCLAS_Init()) //初始化分词组件。
    {
            QMessageBox::warning(this,"Warnning","Init fails",QMessageBox::Yes);
            return;
    }
    else
    {
            printf("Init ok\n");
    }
    ICTCLAS_SetPOSmap(2);

    QString myStr = ui->text->toPlainText();
    QByteArray ba = myStr.toLocal8Bit();
    char* sText;
    char* sSentence = (char*)malloc(ba.size()+10);
    sText = ba.data();

    int len = 0;
    //这样读入为了将换行符去掉,ASCII码10、13
    for(int i = 0; sText[i] != '\0'; i++)
    {
        if(sText[i] == '\n' || sText[i] == '\r')continue;
        sSentence[len++] = sText[i];
    }
    sSentence[len] = '\0';

    unsigned int nPaLen=strlen(sSentence); // 需要分词的长度
    char* sRst=0;   //用户自行分配空间,用于保存结果;
    sRst=(char*)malloc(nPaLen*6); //建议长度为字符串长度的6倍。

    int nRstLen=0; //分词结果的长度

    nRstLen = ICTCLAS_ParagraphProcess(sSentence,nPaLen,sRst,CODE_TYPE_UNKNOWN,0);  //字符串处理

    //free(sText);
    free(sSentence);

    //收集单词,形成字典
    std::string words;
    std::istringstream istream(sRst);
    std::set<std::string> txtWords;
    while(istream>>words)
    {
        txtWords.insert(words);
        if(featureDic.count(words))
        {
            text[words]++;
        }
    }
    textSize = txtWords.size();
    free(sRst);
    txtWords.clear();
    istream.clear();
    ICTCLAS_Exit();	//释放资源退出
    return;

}
Exemple #2
0
/**
 * Demo: initialises ICTCLAS with an empty data path and the UTF-8 code type,
 * runs ICTCLAS_FileProcess on "testUTF.txt" / "testUTF_result.txt", then shuts
 * the library down. Prints a failure message and does nothing else if
 * initialisation fails.
 */
void SplitUTF8()
{
	// An empty data path with UTF8_CODE selects UTF-8 segmentation.
	if(ICTCLAS_Init("",UTF8_CODE))
	{
		ICTCLAS_FileProcess("testUTF.txt","testUTF_result.txt");
		ICTCLAS_Exit();
	}
	else
	{
		printf("ICTCLAS INIT FAILED!\n");
	}
}
Exemple #3
0
/**
 * Demo: initialises ICTCLAS with an empty data path and the BIG5 code type,
 * runs ICTCLAS_FileProcess on "testBIG.txt" / "testBIG_result.txt", then shuts
 * the library down. Prints a failure message and does nothing else if
 * initialisation fails.
 */
void SplitBIG5()
{
	// An empty data path with BIG5_CODE selects BIG5 segmentation.
	if(ICTCLAS_Init("",BIG5_CODE))
	{
		ICTCLAS_FileProcess("testBIG.txt","testBIG_result.txt");
		ICTCLAS_Exit();
	}
	else
	{
		printf("ICTCLAS INIT FAILED!\n");
	}
}
/**
 * Skeleton demo: initialises ICTCLAS and immediately releases it again.
 * No processing is performed between the two calls.
 *
 * @param sInput  Not used by the current body.
 */
void FingerPrint(const char *sInput)
{
	// Start the segmentation component; on success just release it again.
	if(ICTCLAS_Init())
	{
		ICTCLAS_Exit();
		return;
	}

	printf("ICTCLAS INIT FAILED!\n");
}
/**
 * Interactive demo: reads whitespace-delimited tokens from stdin, segments
 * each with ICTCLAS_ParagraphProcess and prints the result, until the token
 * "q" (case-insensitive) is entered or input ends.
 *
 * @return 0 on normal termination, 1 if ICTCLAS could not be initialised.
 */
int main(int argc, char* argv[])
{
	//Sample1: Sentence or paragraph lexical analysis with only one result

	// The result buffer is six times the input buffer, the ratio recommended
	// for segmentation output.
    char sSentence[2000],sSentenceResult[2000*6];
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return 1;
	}
	printf("Input sentence now!\n");
	// "%1999s" bounds the read to the buffer size. Checking the return value
	// stops the loop on EOF or a read error instead of spinning forever on
	// stale (or uninitialised) buffer contents.
	while(scanf("%1999s",sSentence)==1 && _stricmp(sSentence,"q")!=0)
	{
		ICTCLAS_ParagraphProcess(sSentence,sSentenceResult);
		printf("%s\nInput string now!\n",sSentenceResult);
	}
    ICTCLAS_Exit();

	//Sample2: File segmentation and POS tagging
/* 
    ICTCLAS_Init();	
	ICTCLAS_FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt");
	ICTCLAS_Exit();
*/

	//Sample3: Sentence segmentation and POS tagging with multiple result
/* 
    char sSentence[2000],**sSentenceResult;
	int i;
	sSentenceResult=new char*[5];
	for(i=0;i<5;i++)
		sSentenceResult[i]=new char[5000];		

	ICTCLAS_Init();
	printf("Input sentence now!\n");
	scanf("%s",sSentence);
	while(_stricmp(sSentence,"q")!=0)
	{
		ICTCLAS_SentenceProcess(sSentence,5,sSentenceResult);
		for(i=0;i<5;i++)
			printf("Result%d:%s\n",i+1,sSentenceResult[i]);
		printf("Input string now!\n");
		scanf("%s",sSentence);
	}
    ICTCLAS_Exit();
	
	for(i=0;i<5;i++)
		delete[] sSentenceResult[i];		
	delete [] sSentenceResult;
*/
	return 0;
}
/**
 * Demo: segments `sInput`, prints the text's fingerprint in hexadecimal, then
 * prints each extracted keyword that has a positive weight as
 * "<keyword>\t<weight>".
 *
 * @param sInput  NUL-terminated text to analyse. Keyword positions reported by
 *                the library are used as byte offsets into this buffer.
 */
void KeyExtract(const char *sInput)
{
	// Initialise the segmentation component.
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return ;
	}

    int nCount = ICTCLAS_GetParagraphProcessAWordCount(sInput);

	// Allocate at least one slot so that an empty result never turns into a
	// zero-byte allocation, and verify both buffers before handing them to
	// the library.
	const size_t nSlots = nCount > 0 ? (size_t)nCount : 1;
	result_t *result =(result_t*)malloc(sizeof(result_t)*nSlots);
	result_t *resultKey = (result_t*)malloc(sizeof(result_t)*nSlots);
	if(!result || !resultKey)
	{
		printf("ICTCLAS KeyExtract: out of memory\n");
		free(resultKey);
		free(result);
		ICTCLAS_Exit();
		return ;
	}

	// Segment; the results are copied into caller-owned memory.
	ICTCLAS_ParagraphProcessAW(nCount,result);

	// Fingerprint extraction; must run after ICTCLAS_ParagraphProcessAW.
	unsigned long lFinger = ICTCLAS_FingerPrint();
	// "%lx" matches unsigned long; the original "%x" was a format mismatch.
	printf("%lx\n", lFinger);

	// Keyword extraction; must run after ICTCLAS_ParagraphProcessAW.
	int nCountKey = 0;
	ICTCLAS_KeyWord(resultKey, nCountKey);

	for (int i=0; i<nCountKey; i++)
	{
		if(resultKey[i].weight<=0)
		{
			continue;
		}
		const int index = resultKey[i].start;
		// Print straight from the input with a bounded precision instead of
		// copying into a fixed 100-byte buffer, which overflowed for long
		// keywords. A negative length is treated as empty.
		const int length = resultKey[i].length > 0 ? (int)resultKey[i].length : 0;
		printf("%.*s\t%d\n", length, sInput+index, resultKey[i].weight);
	}

	free(resultKey);
	free(result);
    
	// Release the segmenter's resources.
	ICTCLAS_Exit();
}
/**
 * Command-line demo: segments the file named by argv[1] into the file named
 * by argv[2] using UTF-8 encoding.
 *
 * @return 0 on success, 1 on a usage error or if ICTCLAS fails to initialise.
 */
int main(int argc, char** argv)
{
	//testNewWord(GBK_CODE);
	//testNewWord(UTF8_CODE);

    // Both file arguments are required; without this check argv[1]/argv[2]
    // could be null and were passed straight to printf("%s").
    if(argc < 3)
    {
        printf("Usage: %s <input file> <output file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    char* inputFile = argv[1];
    char* outputFile = argv[2];
    printf("%s\n", inputFile);
    printf("%s\n", outputFile);

    // Initialise the segmenter with ".." as the data path and UTF-8 encoding.
    if(!ICTCLAS_Init("..",UTF8_CODE))
    {
        printf("ICTCLAS INIT FAILED!\n");
        // Report failure with a non-zero exit status (the original returned
        // NULL, i.e. success).
        return 1;
    }
   	ICTCLAS_FileProcess(inputFile, outputFile, 0);
  	ICTCLAS_Exit();
    return 0;
}
Exemple #8
0
/**
 * Segmentation demo using the default initialisation (GBK encoding).
 *
 * Steps, in order: segment a built-in sample sentence while adding user words
 * one at a time; run an interactive loop that segments tokens read from stdin
 * until "q" (case-insensitive) is entered or input ends; segment `sInput`
 * before and after importing "userdic.txt" and after adding one more user
 * word; finally process "testGBK.txt" into "testGBK_result.txt".
 *
 * @param sInput  NUL-terminated text used for the user-dictionary comparison.
 */
void SplitGBK(const char *sInput)
{// Segmentation demo

	// Initialise the segmenter (default arguments: GBK encoding).
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return ;
	}

	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

	char sSentence[2000]="三枪拍案惊奇的主创人员包括孙红雷、小沈阳、闫妮等,导演为张艺谋";
	const char * sResult;

	int nCount;
	ICTCLAS_ParagraphProcessA(sSentence,&nCount);
	printf("nCount=%d\n",nCount);

	// Add the actor and film names one by one, re-segmenting after each
	// addition to show the effect.
	ICTCLAS_AddUserWord("孙红雷 yym");// first actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("小沈阳 yym");// second actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("闫妮 yym");// third actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("三枪拍案惊奇 dym");// film title
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	

	// Interactive loop; the first pass uses the built-in sample sentence.
	while(_stricmp(sSentence,"q")!=0)
	{
		sResult = ICTCLAS_ParagraphProcess(sSentence,0);
		printf("%s\nInput string now('q' to quit)!\n", sResult);
		// "%1999s" bounds the read to the buffer size. Leaving the loop on
		// EOF or a read error avoids re-processing the same text forever.
		if(scanf("%1999s",sSentence)!=1)
		{
			break;
		}
	}
	
	// Before importing the user dictionary
	printf("未导入用户词典:\n");
	sResult = ICTCLAS_ParagraphProcess(sInput, 0);
	printf("%s\n", sResult);

	// After importing the user dictionary
	printf("\n导入用户词典后:\n");
	nCount = ICTCLAS_ImportUserDict("userdic.txt");// userdic.txt replaces the previous user dictionary
	// Save the user dictionary
	ICTCLAS_SaveTheUsrDic();
	printf("导入%d个用户词。\n", nCount);
	
	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);

	// Add a user word dynamically
	printf("\n动态添加用户词后:\n");
	ICTCLAS_AddUserWord("计算机学院   xueyuan");
	ICTCLAS_SaveTheUsrDic();
	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);


	// Segment a whole file
	ICTCLAS_FileProcess("testGBK.txt","testGBK_result.txt",1);


	// Release the segmenter's resources.
	ICTCLAS_Exit();
}