예제 #1
0
void MainWindow::textBagOfWords(std::set<std::string> &featureDic, std::map<std::string, int> &text,int &textSize)
{
    if(!ICTCLAS_Init()) //初始化分词组件。
    {
            QMessageBox::warning(this,"Warnning","Init fails",QMessageBox::Yes);
            return;
    }
    else
    {
            printf("Init ok\n");
    }
    ICTCLAS_SetPOSmap(2);

    QString myStr = ui->text->toPlainText();
    QByteArray ba = myStr.toLocal8Bit();
    char* sText;
    char* sSentence = (char*)malloc(ba.size()+10);
    sText = ba.data();

    int len = 0;
    //这样读入为了将换行符去掉,ASCII码10、13
    for(int i = 0; sText[i] != '\0'; i++)
    {
        if(sText[i] == '\n' || sText[i] == '\r')continue;
        sSentence[len++] = sText[i];
    }
    sSentence[len] = '\0';

    unsigned int nPaLen=strlen(sSentence); // 需要分词的长度
    char* sRst=0;   //用户自行分配空间,用于保存结果;
    sRst=(char*)malloc(nPaLen*6); //建议长度为字符串长度的6倍。

    int nRstLen=0; //分词结果的长度

    nRstLen = ICTCLAS_ParagraphProcess(sSentence,nPaLen,sRst,CODE_TYPE_UNKNOWN,0);  //字符串处理

    //free(sText);
    free(sSentence);

    //收集单词,形成字典
    std::string words;
    std::istringstream istream(sRst);
    std::set<std::string> txtWords;
    while(istream>>words)
    {
        txtWords.insert(words);
        if(featureDic.count(words))
        {
            text[words]++;
        }
    }
    textSize = txtWords.size();
    free(sRst);
    txtWords.clear();
    istream.clear();
    ICTCLAS_Exit();	//释放资源退出
    return;

}
예제 #2
0
// Segments "testUTF.txt" into "testUTF_result.txt" with the segmenter
// initialised for UTF-8 input.
void SplitUTF8()
{
	// Per the original note, the empty path means the ICTCLAS data lives in
	// the current directory.
	if(ICTCLAS_Init("",UTF8_CODE))
	{
		ICTCLAS_FileProcess("testUTF.txt","testUTF_result.txt");
		ICTCLAS_Exit();
	}
	else
	{
		printf("ICTCLAS INIT FAILED!\n");
	}
}
예제 #3
0
// Segments "testBIG.txt" into "testBIG_result.txt" with the segmenter
// initialised for BIG5 input.
void SplitBIG5()
{
	// Per the original note, the empty path means the ICTCLAS data lives in
	// the current directory.
	if(ICTCLAS_Init("",BIG5_CODE))
	{
		ICTCLAS_FileProcess("testBIG.txt","testBIG_result.txt");
		ICTCLAS_Exit();
	}
	else
	{
		printf("ICTCLAS INIT FAILED!\n");
	}
}
예제 #4
0
// Initialises the segmenter and immediately shuts it down again.
// sInput is accepted but not used by the current body.
void FingerPrint(const char *sInput)
{
	if(ICTCLAS_Init())
	{
		// Initialisation succeeded: release the segmenter's resources.
		ICTCLAS_Exit();
		return;
	}

	printf("ICTCLAS INIT FAILED!\n");
}
예제 #5
0
/**
 * Interactive demo: reads whitespace-delimited tokens from stdin, segments
 * each one with ICTCLAS and prints the result, until "q" (case-insensitive)
 * or end of input is reached.
 *
 * @return 0 on normal termination, 1 if the segmenter cannot be initialised.
 */
int main(int argc, char* argv[])
{
	//Sample1: Sentence or paragraph lexical analysis with only one result

	char sSentence[2000];
	// NOTE(review): ICTCLAS sample code recommends a result buffer six times
	// the input length; the former 5000 bytes were only 2.5x — confirm.
	char sSentenceResult[6 * sizeof(sSentence)];

	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return 1;
	}

	printf("Input sentence now!\n");
	// "%1999s" bounds the read to the buffer (1999 chars + NUL); checking the
	// return value stops the loop on EOF or a read error instead of spinning
	// forever on stale buffer contents.
	while(scanf("%1999s",sSentence)==1 && _stricmp(sSentence,"q")!=0)
	{
		ICTCLAS_ParagraphProcess(sSentence,sSentenceResult);
		printf("%s\nInput string now!\n",sSentenceResult);
	}
	ICTCLAS_Exit();

	//Sample2: File segmentation and POS tagging
/* 
    ICTCLAS_Init();	
	ICTCLAS_FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt");
	ICTCLAS_Exit();
*/

	//Sample3: Sentence segmentation and POS tagging with multiple result
/* 
    char sSentence[2000],**sSentenceResult;
	int i;
	sSentenceResult=new char*[5];
	for(i=0;i<5;i++)
		sSentenceResult[i]=new char[5000];		

	ICTCLAS_Init();
	printf("Input sentence now!\n");
	scanf("%s",sSentence);
	while(_stricmp(sSentence,"q")!=0)
	{
		ICTCLAS_SentenceProcess(sSentence,5,sSentenceResult);
		for(i=0;i<5;i++)
			printf("Result%d:%s\n",i+1,sSentenceResult[i]);
		printf("Input string now!\n");
		scanf("%s",sSentence);
	}
    ICTCLAS_Exit();
	
	for(i=0;i<5;i++)
		delete[] sSentenceResult[i];		
	delete [] sSentenceResult;
*/
	return 0;
}
예제 #6
0
/**
 * Segments sInput, prints its fingerprint as a hexadecimal number, then prints
 * every extracted keyword with a positive weight as "<keyword>\t<weight>".
 *
 * @param sInput  NUL-terminated text to analyse; a null pointer is ignored.
 */
void KeyExtract(const char *sInput)
{
	if(!sInput)
	{
		return;
	}

	// Initialise the segmentation component.
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return ;
	}

	int nCount = ICTCLAS_GetParagraphProcessAWordCount(sInput);

	// Always allocate at least one slot so that a zero (or negative) word
	// count never turns into a zero-sized or wrapped-around allocation.
	const size_t nSlots = nCount > 0 ? static_cast<size_t>(nCount) : 1;
	result_t *result = (result_t*)malloc(sizeof(result_t)*nSlots);
	result_t *resultKey = (result_t*)malloc(sizeof(result_t)*nSlots);
	if(!result || !resultKey)
	{
		printf("ICTCLAS KeyExtract: out of memory!\n");
		free(resultKey);
		free(result);
		ICTCLAS_Exit();
		return;
	}

	// Segment; the results are stored in caller-owned memory.
	ICTCLAS_ParagraphProcessAW(nCount,result);

	// Fingerprint extraction; per the original note it must run after
	// ICTCLAS_ParagraphProcessAW has completed.
	unsigned long lFinger = ICTCLAS_FingerPrint();
	// "%lx" matches unsigned long; the former "%x" was a format mismatch.
	printf("%lx\n", lFinger);

	// Keyword extraction; per the original note it must also run after
	// ICTCLAS_ParagraphProcessAW has completed.
	// NOTE(review): nCountKey is assumed to be a pure output of
	// ICTCLAS_KeyWord; it starts at 0 so the loop is well-defined if the call
	// leaves it unwritten — confirm against the ICTCLAS header.
	int nCountKey = 0;
	ICTCLAS_KeyWord(resultKey, nCountKey);

	for (int i=0; i<nCountKey; i++)
	{
		if(resultKey[i].weight <= 0)
		{
			continue;
		}
		// Print the keyword straight out of sInput with a precision-limited
		// "%.*s" instead of copying it into a fixed 100-byte buffer, which
		// overflowed (or lost its terminator) for keywords of 100+ bytes.
		int nLen = static_cast<int>(resultKey[i].length);
		if(nLen < 0)
		{
			nLen = 0;
		}
		printf("%.*s\t%d\n", nLen, sInput + resultKey[i].start, resultKey[i].weight);
	}

	free(resultKey);
	free(result);

	// Release the segmenter's resources.
	ICTCLAS_Exit();
}
예제 #7
0
/**
 * Command-line file segmenter.
 *
 * Usage: <program> <input file> <output file>
 *
 * Echoes both paths, initialises ICTCLAS in UTF-8 mode and segments the input
 * file into the output file.
 *
 * @return 0 on success, 1 on a usage error or when ICTCLAS cannot be initialised.
 */
int main(int argc, char** argv)
{
	//testNewWord(GBK_CODE);
	//testNewWord(UTF8_CODE);

    // Both paths are mandatory; without this check argv[1]/argv[2] could be
    // null (or past the end of argv) and be handed to printf("%s").
    if(argc < 3)
    {
        fprintf(stderr, "Usage: %s <input file> <output file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    char* inputFile = argv[1];
    char* outputFile = argv[2];
    printf("%s\n", inputFile);
    printf("%s\n", outputFile);

    // Initialise the segmentation component for UTF-8 text.
    // NOTE(review): the first argument ("..") is presumably the ICTCLAS data
    // directory — confirm.
    if(!ICTCLAS_Init("..",UTF8_CODE))
    {
        printf("ICTCLAS INIT FAILED!\n");
        // Report failure to the caller; the former `return NULL` yielded exit
        // status 0 (success).
        return 1;
    }
    ICTCLAS_FileProcess(inputFile, outputFile, 0);
    ICTCLAS_Exit();
    return 0;
}
예제 #8
0
/**
 * HTTPCWS entry point: parses the command line, loads the ICTCLAS dictionary
 * and user dictionary, optionally forks into the background, then serves HTTP
 * requests through libevent's evhttp with httpcws_handler as the generic
 * callback.
 *
 * Options: -l <listen address>  -p <port>  -x <dictionary path, required>
 *          -t <timeout in seconds>  -d (run as daemon)  -h (help)
 *
 * @return 1 after showing help; otherwise the process leaves through exit()
 *         on errors or runs the event loop.
 */
int main(int argc, char **argv)
{
	int c;
	/* Default settings */
	const char *httpcws_settings_listen = "0.0.0.0"; /* const: initialised from a string literal */
	int httpcws_settings_port = 1985;
	char *httpcws_settings_datapath = NULL; /* Path of the Chinese dictionary data */
	bool httpcws_settings_daemon = false;
	int httpcws_settings_timeout = 120; /* Unit: seconds */

    /* process arguments */
    while ((c = getopt(argc, argv, "l:p:x:t:dh")) != -1) {
        switch (c) {
        case 'l':
            httpcws_settings_listen = strdup(optarg);
            break;
        case 'p':
            httpcws_settings_port = atoi(optarg);
            break;
        case 'x':
            httpcws_settings_datapath = strdup(optarg); /* Dictionary file directory */
            break;
        case 't':
            httpcws_settings_timeout = atoi(optarg);
            break;			
        case 'd':
            httpcws_settings_daemon = true;
            break;
		case 'h':			
        default:
            show_help();
            return 1;
        }
    }
	
	/* The -x argument is mandatory */
	if (httpcws_settings_datapath == NULL) {
		show_help();
		fprintf(stderr, "Attention: Please use the indispensable argument: -x <path>\n\n");		
		exit(1);
	}

	/* Initialise the segmentation component */
	if(!ICTCLAS_Init(httpcws_settings_datapath))
	{
		printf("%s\n", httpcws_settings_datapath);
		fprintf(stderr, "ERROR: Count not open the Chinese dictionary!\n\n");		
		exit(1);
	}
	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

	fprintf(stderr, "Loading Chinese dictionary 'httpcws_dict.txt' into memory, please waitting ......\n");

	/* Build "<datapath>/httpcws_dict.txt". snprintf bounds the write to the
	   buffer; an overlong -x path is rejected instead of overflowing it. */
	char httpcws_settings_dataname[1024];
	int nameLen = snprintf(httpcws_settings_dataname, sizeof(httpcws_settings_dataname),
	                       "%s/httpcws_dict.txt", httpcws_settings_datapath);
	if (nameLen < 0 || (size_t)nameLen >= sizeof(httpcws_settings_dataname)) {
		fprintf(stderr, "ERROR: Dictionary path is too long!\n\n");
		exit(1);
	}
	int nCount = ICTCLAS_ImportUserDict(httpcws_settings_dataname);
	ICTCLAS_SaveTheUsrDic();
	printf("OK! %d words has loaded into memory.\n\n", nCount);
	printf("HTTPCWS Server running on %s:%d\n", httpcws_settings_listen, httpcws_settings_port);

	/* With -d, continue in a forked child and let the parent exit */
	if (httpcws_settings_daemon == true){
        pid_t pid;

        /* Fork off the parent process */       
        pid = fork();
        if (pid < 0) {
                exit(EXIT_FAILURE);
        }
        /* If we got a good PID, then
           we can exit the parent process. */
        if (pid > 0) {
                exit(EXIT_SUCCESS);
        }
	}
	
	/* Request handling */
	struct evhttp *httpd;

	event_init();
	httpd = evhttp_start(httpcws_settings_listen, httpcws_settings_port);
	if (httpd == NULL) {
		/* Without this check a failed start would be dereferenced below. */
		fprintf(stderr, "ERROR: Could not listen on %s:%d\n\n", httpcws_settings_listen, httpcws_settings_port);
		exit(1);
	}
	evhttp_set_timeout(httpd, httpcws_settings_timeout);

	/* Set a callback for all other requests. */
	evhttp_set_gencb(httpd, httpcws_handler, NULL);

	event_dispatch();

	/* Not reached in this code as it is now. */
	evhttp_free(httpd);

    return 0;
}
예제 #9
0
/**
 * Segmentation demo using the default initialisation (GBK, per the original note).
 *
 * Segments a built-in sample sentence while adding user words one at a time,
 * then segments tokens typed on stdin until "q" (case-insensitive) or end of
 * input, then segments sInput before and after importing "userdic.txt" and
 * after adding one more user word, and finally segments "testGBK.txt" into
 * "testGBK_result.txt".
 *
 * @param sInput  NUL-terminated text used for the user-dictionary comparison.
 */
void SplitGBK(const char *sInput)
{
	// Initialise the segmentation component; per the original note the data
	// is expected in the current directory and the default encoding is GBK.
	if(!ICTCLAS_Init())
	{
		printf("ICTCLAS INIT FAILED!\n");
		return ;
	}

	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

	char sSentence[2000]="三枪拍案惊奇的主创人员包括孙红雷、小沈阳、闫妮等,导演为张艺谋";
	const char * sResult;

	int nCount;
	ICTCLAS_ParagraphProcessA(sSentence,&nCount);
	printf("nCount=%d\n",nCount);

	ICTCLAS_AddUserWord("孙红雷 yym");// Add 孙红雷 as an actor name.
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("小沈阳 yym");// Add 小沈阳 as an actor name.
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("闫妮 yym");// Add 闫妮 as an actor name.
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("三枪拍案惊奇 dym");// Add 三枪拍案惊奇 as a film title.
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	

	// Interactive loop: the first pass segments the built-in sentence, later
	// passes segment whatever was typed.
	while(_stricmp(sSentence,"q")!=0)
	{
		sResult = ICTCLAS_ParagraphProcess(sSentence,0);
		printf("%s\nInput string now('q' to quit)!\n", sResult);
		// "%1999s" bounds the read to the buffer (1999 chars + NUL); stop on
		// EOF or a read error instead of looping forever on stale input.
		if(scanf("%1999s",sSentence)!=1)
		{
			break;
		}
	}
	
	// Before importing the user dictionary.
	printf("未导入用户词典:\n");
	sResult = ICTCLAS_ParagraphProcess(sInput, 0);
	printf("%s\n", sResult);

	// After importing the user dictionary.
	printf("\n导入用户词典后:\n");
	nCount = ICTCLAS_ImportUserDict("userdic.txt");// Per the original note, userdic.txt replaces the previous user dictionary.
	// Save the user dictionary.
	ICTCLAS_SaveTheUsrDic();
	printf("导入%d个用户词。\n", nCount);
	
	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);

	// Add a user word dynamically.
	printf("\n动态添加用户词后:\n");
	ICTCLAS_AddUserWord("计算机学院   xueyuan");
	ICTCLAS_SaveTheUsrDic();
	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);


	// Segment a file.
	ICTCLAS_FileProcess("testGBK.txt","testGBK_result.txt",1);


	// Release the segmenter's resources.
	ICTCLAS_Exit();
}