Exemple #1
0
void MainWindow::textBagOfWords(std::set<std::string> &featureDic, std::map<std::string, int> &text,int &textSize)
{
    if(!ICTCLAS_Init()) //初始化分词组件。
    {
            QMessageBox::warning(this,"Warnning","Init fails",QMessageBox::Yes);
            return;
    }
    else
    {
            printf("Init ok\n");
    }
    ICTCLAS_SetPOSmap(2);

    QString myStr = ui->text->toPlainText();
    QByteArray ba = myStr.toLocal8Bit();
    char* sText;
    char* sSentence = (char*)malloc(ba.size()+10);
    sText = ba.data();

    int len = 0;
    //这样读入为了将换行符去掉,ASCII码10、13
    for(int i = 0; sText[i] != '\0'; i++)
    {
        if(sText[i] == '\n' || sText[i] == '\r')continue;
        sSentence[len++] = sText[i];
    }
    sSentence[len] = '\0';

    unsigned int nPaLen=strlen(sSentence); // 需要分词的长度
    char* sRst=0;   //用户自行分配空间,用于保存结果;
    sRst=(char*)malloc(nPaLen*6); //建议长度为字符串长度的6倍。

    int nRstLen=0; //分词结果的长度

    nRstLen = ICTCLAS_ParagraphProcess(sSentence,nPaLen,sRst,CODE_TYPE_UNKNOWN,0);  //字符串处理

    //free(sText);
    free(sSentence);

    //收集单词,形成字典
    std::string words;
    std::istringstream istream(sRst);
    std::set<std::string> txtWords;
    while(istream>>words)
    {
        txtWords.insert(words);
        if(featureDic.count(words))
        {
            text[words]++;
        }
    }
    textSize = txtWords.size();
    free(sRst);
    txtWords.clear();
    istream.clear();
    ICTCLAS_Exit();	//释放资源退出
    return;

}
Exemple #2
0
/*
 * HTTPCWS entry point.
 *
 * Parses the command line, initialises ICTCLAS from the dictionary directory,
 * imports "<datapath>/httpcws_dict.txt" as the user dictionary, optionally
 * forks into the background, then serves HTTP requests through libevent with
 * httpcws_handler as the catch-all callback.
 *
 * Options (getopt string "l:p:x:t:dh"):
 *   -l <addr>     listen address (default "0.0.0.0")
 *   -p <port>     listen port (default 1985)
 *   -x <path>     dictionary directory (required)
 *   -t <seconds>  HTTP timeout in seconds (default 120)
 *   -d            fork and let the parent exit
 *   -h            show help and return 1
 *
 * Returns 1 after showing help for -h or an unknown option; exits with status 1
 * on a missing -x, dictionary load failure, allocation failure, or when the
 * HTTP listener cannot be started.
 */
int main(int argc, char **argv)
{
	int c;
	/* Default settings */
	const char *httpcws_settings_listen = "0.0.0.0"; /* const: a string literal must not bind to char* */
	int httpcws_settings_port = 1985;
	char *httpcws_settings_datapath = NULL; /* Path of the Chinese dictionary data */
	bool httpcws_settings_daemon = false;
	int httpcws_settings_timeout = 120; /* Unit: seconds */

	/* process arguments */
	while ((c = getopt(argc, argv, "l:p:x:t:dh")) != -1) {
		switch (c) {
		case 'l':
			httpcws_settings_listen = strdup(optarg);
			break;
		case 'p':
			httpcws_settings_port = atoi(optarg);
			break;
		case 'x':
			httpcws_settings_datapath = strdup(optarg); /* Dictionary file storage path */
			break;
		case 't':
			httpcws_settings_timeout = atoi(optarg);
			break;
		case 'd':
			httpcws_settings_daemon = true;
			break;
		case 'h': /* deliberate fall-through: -h and unknown options both show help */
		default:
			show_help();
			return 1;
		}
	}

	/* The -x argument is mandatory */
	if (httpcws_settings_datapath == NULL) {
		show_help();
		fprintf(stderr, "Attention: Please use the indispensable argument: -x <path>\n\n");
		exit(1);
	}

	/* Initialise the word-segmentation component */
	if(!ICTCLAS_Init(httpcws_settings_datapath))
	{
		printf("%s\n", httpcws_settings_datapath);
		fprintf(stderr, "ERROR: Count not open the Chinese dictionary!\n\n");
		exit(1);
	}
	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

	fprintf(stderr, "Loading Chinese dictionary 'httpcws_dict.txt' into memory, please waitting ......\n");

	/* Build "<datapath>/httpcws_dict.txt". The buffer is sized from the actual
	   path length so that an arbitrarily long -x argument cannot overflow it;
	   sizeof(dict_suffix) already includes the terminating NUL. */
	static const char dict_suffix[] = "/httpcws_dict.txt";
	size_t dataname_size = strlen(httpcws_settings_datapath) + sizeof(dict_suffix);
	char *httpcws_settings_dataname = (char *)malloc(dataname_size);
	if (httpcws_settings_dataname == NULL) {
		fprintf(stderr, "ERROR: Out of memory!\n\n");
		exit(1);
	}
	snprintf(httpcws_settings_dataname, dataname_size, "%s%s", httpcws_settings_datapath, dict_suffix);

	int nCount = ICTCLAS_ImportUserDict(httpcws_settings_dataname);
	ICTCLAS_SaveTheUsrDic();
	free(httpcws_settings_dataname);
	printf("OK! %d words has loaded into memory.\n\n", nCount);
	printf("HTTPCWS Server running on %s:%d\n", httpcws_settings_listen, httpcws_settings_port);

	/* With -d, continue in a forked child and let the parent exit */
	if (httpcws_settings_daemon == true){
		pid_t pid;

		/* Fork off the parent process */
		pid = fork();
		if (pid < 0) {
			exit(EXIT_FAILURE);
		}
		/* If we got a good PID, then
		   we can exit the parent process. */
		if (pid > 0) {
			exit(EXIT_SUCCESS);
		}
	}

	/* Request handling */
	struct evhttp *httpd;

	event_init();
	httpd = evhttp_start(httpcws_settings_listen, httpcws_settings_port);
	if (httpd == NULL) {
		/* e.g. address already in use; the calls below would dereference NULL */
		fprintf(stderr, "ERROR: Could not start HTTP server on %s:%d\n\n", httpcws_settings_listen, httpcws_settings_port);
		exit(1);
	}
	evhttp_set_timeout(httpd, httpcws_settings_timeout);

	/* Set a callback for all other requests. */
	evhttp_set_gencb(httpd, httpcws_handler, NULL);

	event_dispatch();

	/* Not reached in this code as it is now. */
	evhttp_free(httpd);

	return 0;
}
Exemple #3
0
/**
 * Word-segmentation demo for GBK text using ICTCLAS.
 *
 * Steps, in order:
 *  1. Segments a built-in sample sentence, adding user words one at a time and
 *     printing the result after each addition.
 *  2. Enters an interactive loop that segments whitespace-delimited tokens read
 *     from stdin until "q" (case-insensitive) is entered or input ends.
 *  3. Segments `sInput` before importing "userdic.txt", after importing it, and
 *     after dynamically adding one more user word.
 *  4. Segments the file "testGBK.txt" into "testGBK_result.txt".
 *
 * @param sInput  NUL-terminated text to segment in step 3.
 *
 * Returns early, after printing a message, if ICTCLAS cannot be initialised.
 */
void SplitGBK(const char *sInput)
{// Segmentation demo

	// Initialise the segmentation component
	if(!ICTCLAS_Init())// Data is in the current directory; segmentation defaults to GBK encoding
	{
		printf("ICTCLAS INIT FAILED!\n");
		return ;
	}

	ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);

	char sSentence[2000]="三枪拍案惊奇的主创人员包括孙红雷、小沈阳、闫妮等,导演为张艺谋";
	const char * sResult;

	int nCount = 0; // initialised so the printf below never reads an indeterminate value
	ICTCLAS_ParagraphProcessA(sSentence,&nCount);
	printf("nCount=%d\n",nCount);

	ICTCLAS_AddUserWord("孙红雷 yym");// Add this word, tagged as an actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("小沈阳 yym");// Add this word, tagged as an actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("闫妮 yym");// Add this word, tagged as an actor name
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);
	ICTCLAS_AddUserWord("三枪拍案惊奇 dym");// Add this word, tagged as a movie title
	sResult = ICTCLAS_ParagraphProcess(sSentence,1);
	printf("%s\n", sResult);


	while(_stricmp(sSentence,"q")!=0)
	{
		sResult = ICTCLAS_ParagraphProcess(sSentence,0);
		printf("%s\nInput string now('q' to quit)!\n", sResult);
		// The field width keeps the read inside sSentence (1999 chars + NUL).
		// Stop on EOF or a read error instead of re-processing the stale
		// buffer forever.
		if(scanf("%1999s",sSentence)!=1)
		{
			break;
		}
	}

	// Before importing the user dictionary
	printf("未导入用户词典:\n");
	sResult = ICTCLAS_ParagraphProcess(sInput, 0);
	printf("%s\n", sResult);

	// After importing the user dictionary
	printf("\n导入用户词典后:\n");
	nCount = ICTCLAS_ImportUserDict("userdic.txt");// userdic.txt overwrites the previous user dictionary
	// Save the user dictionary
	ICTCLAS_SaveTheUsrDic();
	printf("导入%d个用户词。\n", nCount);

	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);

	// Dynamically add a user word
	printf("\n动态添加用户词后:\n");
	ICTCLAS_AddUserWord("计算机学院   xueyuan");
	ICTCLAS_SaveTheUsrDic();
	sResult = ICTCLAS_ParagraphProcess(sInput, 1);
	printf("%s\n", sResult);


	// Segment a whole file
	ICTCLAS_FileProcess("testGBK.txt","testGBK_result.txt",1);


	// Release the segmentation component's resources
	ICTCLAS_Exit();
}