/*
 * ParagraphProcess(text) -> text
 *
 * Segments the input text with NLPIR_ParagraphProcess and returns the
 * segmented string as a new text value.  The second argument passed to
 * NLPIR_ParagraphProcess is 1; per the original author's note this turns on
 * part-of-speech tagging (0 would turn it off).
 *
 * Returns SQL NULL if the segmenter hands back a NULL pointer.
 */
PGDLLEXPORT
Datum
ParagraphProcess(PG_FUNCTION_ARGS)
{
	/*
	 * PG_GETARG_TEXT_PP detoasts the argument.  Reading it through
	 * PG_GETARG_POINTER and plain VARSIZE/VARDATA is only valid for
	 * uncompressed values with a 4-byte header, and yields a garbage length
	 * for short-header or toasted datums.
	 */
	text	   *in = PG_GETARG_TEXT_PP(0);
	char	   *s;
	const char *splitWords;

	/*
	 * text values are not NUL-terminated; text_to_cstring makes a palloc'd,
	 * NUL-terminated copy.  Without the terminator the segmenter reads past
	 * the end of the input and produces garbled trailing characters.
	 */
	s = text_to_cstring(in);

	NLPIR_Init(sDataPath,1,0);	/* initialise the segmenter */

	splitWords = NLPIR_ParagraphProcess(s,1);

	/*
	 * NLPIR_Exit() is intentionally not called here: the original author
	 * recorded that calling it before the result was converted failed.
	 */
	if (splitWords == NULL)
		PG_RETURN_NULL();

	/* Copy the segmenter's C string into a freshly allocated text value. */
	PG_RETURN_TEXT_P(cstring_to_text(splitWords));
}
/*
 * chinese_tokenize(cstring) -> int4
 *
 * Diagnostic helper: segments the NUL-terminated input string with
 * NLPIR_ParagraphProcess (second argument 0) and returns the byte length of
 * the segmented result, as reported by strlen.
 *
 * Historical note from the original author: an earlier variant that passed a
 * text value straight to the segmenter produced garbled trailing characters;
 * the cause was the missing NUL terminator on the input, not the segmenter.
 *
 * Returns SQL NULL if the segmenter hands back a NULL pointer.
 */
PGDLLEXPORT
Datum
chinese_tokenize(PG_FUNCTION_ARGS)
{
	char	   *in = PG_GETARG_CSTRING(0);
	const char *splitWords;

	NLPIR_Init(sDataPath,1,0);	/* initialise the segmenter */

	splitWords = NLPIR_ParagraphProcess(in,0);

	/* strlen(NULL) is undefined behaviour; report "no result" instead. */
	if (splitWords == NULL)
		PG_RETURN_NULL();

	/* Length in bytes, not characters; the terminator is not counted. */
	PG_RETURN_INT32(strlen(splitWords));
}
//PG_FUNCTION_INFO_V1(testprs_star);
//PGDLLEXPORT
/*
 * testprs_start(internal, int4) -> internal
 *
 * Allocates a ParserState, segments the input with NLPIR_ParagraphProcess
 * (second argument 1) and stores the segmented string in the state.
 *
 * NOTE(review): this assumes the function is registered as the START function
 * of a text search parser.  PostgreSQL calls such functions with a raw pointer
 * to the text bytes (argument 0) and the byte length (argument 1); argument 0
 * is NOT a varlena, so VARSIZE/VARDATA must not be applied to it.  Confirm
 * against the CREATE TEXT SEARCH PARSER definition.
 */
Datum 
testprs_start(PG_FUNCTION_ARGS)
{
	char	   *in = (char *) PG_GETARG_POINTER(0);
	int			len = PG_GETARG_INT32(1);	/* argument indexes are 0-based */
	char	   *s;
	char	   *splitWords;
	ParserState *pst;

	/*
	 * The input buffer is not NUL-terminated, so make a terminated copy for
	 * the segmenter; without it the output ends in garbled characters.
	 */
	s = (char *) palloc(len + 1);
	memcpy(s, in, len);
	s[len] = '\0';

	NLPIR_Init(sDataPath,1,0);	/* initialise the segmenter */

	splitWords = (char *) NLPIR_ParagraphProcess(s,1);

	/* palloc0 zero-fills the state, so unset fields start out as 0/NULL. */
	pst = (ParserState *) palloc0(sizeof(ParserState));
	pst->buffer = splitWords;
	pst->sege = splitWords;

	/*
	 * NOTE(review): len is the length of the ORIGINAL input, while buffer and
	 * sege point at the segmented output, which may have a different length.
	 * Kept as in the original; verify against the code that consumes
	 * pst->len.
	 */
	pst->len = len;
	pst->pos = 0;

	PG_RETURN_POINTER(pst);
}
Beispiel #4
0
/*
 * Initialise the NLPIR segmenter from sDataPath.
 *
 * Raises a PostgreSQL ERROR (ERRCODE_INTERNAL_ERROR) when NLPIR_Init reports
 * failure; otherwise returns normally.
 */
static void init()
{
	/* Success: nothing more to do. */
	if (NLPIR_Init(sDataPath,1,0))
		return;

	/* ereport(ERROR, ...) does not return to the caller. */
	ereport(ERROR,
			(errcode(ERRCODE_INTERNAL_ERROR),
			 errmsg("Chinese Parser Lib SCWS could not init!\"%s\"","")));
}
Beispiel #5
0
/// Reads the "textquery" settings from the shared CConfig instance into the
/// member paths, then initialises NLPIR from m_datadir in UTF8_CODE mode.
/// Prints a message and terminates the process with status 1 if NLPIR_Init
/// fails.
NM::CDiction::CDiction()
{
    CConfig* cfg = NM::CConfig::getInstance();

    cfg->getconfigInfo("textquery","distdir",m_distdir);           // corpus directory
    cfg->getconfigInfo("textquery","datadir",m_datadir);           // segmenter data directory
    cfg->getconfigInfo("textquery","stopdir",m_stopdir);           // stop-word directory
    cfg->getconfigInfo("textquery","wordfreqpath",m_wordfreqpath);

    if(NLPIR_Init(m_datadir.c_str(),UTF8_CODE))
    {
        return;
    }

    // Initialisation failed: report and abort the program.
    std::cout<<"NLPLR Init Fail!"<<std::endl;
    exit(1);
}
const struct  result_tokenize_s *tokenize_v2(const char *intext)
{
	
	struct result_tokenize_s *pVecResult;
//char *sResult=NULL;如果保存结果的常量在这里会出错.返回的结果最后作为全局变量放到函数体外
NLPIR_Init(sDataPath,1,0);//初始化分词函数

(pVecResult->splitWords_s)=(char *)NLPIR_ParagraphProcess(intext,0);//直接这样导入intext参数会导致分词结果出错,或乱码.

// =strlen(sResult);
(pVecResult->splitWords_len)=Int32GetDatum(strlen(pVecResult->splitWords_s));

return pVecResult;//测试表明,错不在上面这个分词函数

}
/* bak tokenize
const char *tokenize(const char *intext)
{
//const char *sResult=NULL;
handle=NLPIR_Init(sDataPath,1,0);
if(!handle)//数据在当前根路径下,默认为GBK编码的分词,即encode=0,若encode=1则为utf8.postgresql默认是utf8编码.
	{
		//printf("ICTCLAS INIT FAILED!\n");
		//return;
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"",""
				       )));		
	}
else
//	printf("NLPIR DATA INIT OK!");
ereport(NOTICE,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"",""
				       )));

sResult=NLPIR_ParagraphProcess(intext,0);

return sResult;
}
*/
/*
 * Segments intext with NLPIR_ParagraphProcess (second argument 0).
 *
 * The result pointer is stored in the file-level variable sResult and also
 * returned.  The original author notes that keeping the result in a local
 * variable instead caused errors, which is why it lives outside the function.
 */
static const char  *
tokenize(const char *intext)
{
	/* Initialise the segmenter before every call, as the original does. */
	NLPIR_Init(sDataPath,1,0);

	/* Remember the result in sResult and hand the same pointer back. */
	return (sResult = NLPIR_ParagraphProcess(intext,0));
}
Beispiel #8
0
/// Command-line segmenter: initialises NLPIR in UTF8_CODE mode with argv[1]
/// as the first NLPIR_Init argument, then reads whitespace-delimited tokens
/// from stdin and prints the segmentation of each one on its own line.
///
/// Exits with status 1 when the argument is missing or initialisation fails.
int main(int argc,char* argv[])
{
    // argv[1] was used unconditionally; without an argument that is a NULL
    // (or out-of-range) pointer handed to NLPIR_Init.
    if(argc < 2)
    {
        printf("Usage: %s <nlpir-data-dir>\n", argc > 0 ? argv[0] : "program");
        exit(1);
    }

    if(!NLPIR_Init(argv[1],UTF8_CODE))
    {
        printf("Init Fail!\n");
        exit(1);
    }

    std::string line;
    // NOTE(review): operator>> splits on whitespace, so despite the name
    // this reads one word at a time, not one line. Kept as in the original.
    while(std::cin >> line)
    {
        const char* pres = NLPIR_ParagraphProcess(line.c_str(),0);
        // Streaming a NULL char pointer is undefined behaviour; print an
        // empty line instead so output stays one line per input token.
        std::cout<<(pres ? pres : "")<<std::endl;
    }

    NLPIR_Exit();
    return 0;
}
/*
 * Segments the text value in intxt with NLPIR_ParagraphProcess (second
 * argument 0) and returns the segmenter's result pointer.
 *
 * NOTE(review): intxt is treated as a detoasted varlena (text) with a 4-byte
 * header, because the body reads it with VARSIZE/VARDATA.  Confirm against
 * the callers; if they pass a plain C string this must change.
 *
 * Returns NULL when intxt is NULL.
 */
static const char *
ParagraphProcess2( char *intxt)
{
	/*
	 * The original ignored intxt and read from a local pointer that was
	 * always NULL, so VARSIZE dereferenced NULL on every call.
	 */
	char *in = intxt;
	int len;
	char *s;

	if (in == NULL)
		return NULL;

	/* Make a NUL-terminated copy; text payloads carry no terminator. */
	len=VARSIZE(in)-VARHDRSZ;
	s = (char *)palloc(len+1);
	memcpy(s,VARDATA(in),len);
	s[len]='\0';	/* the terminator is what prevents garbled output */

	NLPIR_Init(sDataPath,1,0);	/* initialise the segmenter */

	/*
	 * The original palloc'd an output buffer and then immediately overwrote
	 * the pointer with the segmenter's own result; that dead allocation is
	 * dropped.
	 */
	return NLPIR_ParagraphProcess(s,0);
}
Beispiel #10
0
/*
 * Initialises NLPIR, reports the outcome through ereport, then segments
 * intext with NLPIR_ParagraphProcess (second argument 0).
 *
 * The NLPIR_Init result is kept in the file-level variable handle and the
 * segmentation result in the file-level variable sResult, which is also
 * returned.  A failed initialisation raises a PostgreSQL ERROR; a successful
 * one emits a NOTICE on every call.
 *
 * Per the original author's note, the second NLPIR_Init argument selects the
 * encoding (0 = GBK, 1 = UTF-8), and 1 is used because PostgreSQL defaults to
 * UTF-8.
 */
const char *tokenize(char *intext){

	handle = NLPIR_Init(sDataPath,1,0);

	if (handle)
	{
		ereport(NOTICE,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"","")));
	}
	else
	{
		/* ereport(ERROR, ...) does not return to the caller. */
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"","")));
	}

	sResult = NLPIR_ParagraphProcess(intext,0);

	return sResult;
}
/*
 * out_len(text) -> int4
 *
 * Segments the input text with NLPIR_ParagraphProcess (second argument 0)
 * and returns the byte length of the segmented string as reported by strlen.
 */
PGDLLEXPORT
Datum
out_len(PG_FUNCTION_ARGS)
{
	char	   *arg = (char *)PG_GETARG_TEXT_P(0);
	int			nbytes = VARSIZE(arg)-VARHDRSZ;	/* payload size without header */
	char	   *cstr = (char *)palloc(nbytes+1);	/* NUL-terminated copy of the input */
	char	   *segmented;

	memcpy(cstr,VARDATA(arg),nbytes);
	cstr[nbytes]='\0';	/* the terminator is what prevents garbled output */

	NLPIR_Init(sDataPath,1,0);	/* initialise the segmenter */

	segmented = (char *)NLPIR_ParagraphProcess(cstr,0);

	PG_RETURN_INT32((int) strlen(segmented));
}
Beispiel #12
0
int main(int argc, char* argv[])
{
	
	
	const char * sResult;
	if(!NLPIR_Init()) 
	{
		printf("Init fails\n");
		return -1;
	}

	//ofstream log;
	//log.open("F:\\VS8_Projects\\NLPIR\\log1.txt");

	clock_t start,end;

	map<string,string> patent  ;//存储IPC号和 标题+摘要字符串对
	vector<string> splitRes;//存储每一个字符串分词后的结果
	vector<string> stopWordFlags;//停用词集合标志			
	map<string,int> mpSplitResult;	//对每一个分词进行文档内的词频统计	
	map<string,struct freDocAttr> resMap;
	TermFreqDocs t;
	CPostgre pg;

	t.generateStopWordFlags(stopWordFlags);//生成停用词集合

	start = clock();//程序执行初始时间标记
	
	
	string hostchinapatent = "dbname=chinapatent user=postgres password=123456 host=localhost port=5432";
	PGconn * conn = pg.connectDatabase(hostchinapatent.c_str()); //连接chinapatent数据库

	string hostsubclassterm = "dbname=subclass_term user=postgres password=123456 host=localhost port=5432";
	PGconn * connsubclassterm = pg.connectDatabase(hostsubclassterm.c_str()); //连接subclass_term数据库
	
	char * sql = "select ic1,ti,ab from chinapatent where ad >= '2000-01-01 00:00:00'::timestamp without time zone AND ad <= '2010-12-31 00:00:00'::timestamp without time zone \
				  and pa like '%华为%'; ";

	string sqlTemp = sql;
	pg.GBKToUTF8(sqlTemp);	

	PGresult *respg = PQexec(conn,sqlTemp.c_str());

	end = clock();
	cout<<"满足专利数目:"<<PQntuples(respg)<<"\n";
	cout<<"数据库查询耗时:"<<1000*(end-start)/CLOCKS_PER_SEC<<" ms\n";
	
	//打开存放结果文件
	int lineMaxSize = 50;
	ifstream file;
	const char * fileName = "log2.txt";
	file.open(fileName);	
	char *line = new char[lineMaxSize];
	struct freDocAttr fda;
	
	//map<string,struct freDocAttr> resMap;
	vector<string> resVec;
	//1、读取存储结果的文件
	while(file.getline(line,lineMaxSize))
	{
		string sline = line;
		if(sline.size() == 0) break;
		t.split(sline,",",resVec,stopWordFlags);
		
		fda.freq = atoi(resVec.at(1).c_str());
		fda.docs = atoi(resVec.at(2).c_str());
		fda.attr = atoi(resVec.at(3).c_str());

		resMap.insert( make_pair(resVec.at(0), fda) );

		resVec.clear();
	}
	file.close();	

	//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


	start = clock();//程序执行初始时间标记
	if( PQresultStatus( respg ) != PGRES_TUPLES_OK)
	{
		cout<<PQerrorMessage(conn);		
		PQclear(respg);		
		
		return false;
	}
	int patentNum = PQntuples(respg);//查询到满足条件的元组数目
	
	for(int i = 0; i < patentNum; i++)
	{		
		string sPatent;
		pair<string,string> patentTemp;

		patentTemp.first = PQgetvalue(respg,i,0);
		patentTemp.second = PQgetvalue(respg,i,1);
		patentTemp.second +=  PQgetvalue(respg,i,2);
	
		t.UTF8ToGBK(patentTemp.second);		
		patent.insert(patentTemp);//获取满足条件的元组的第一列所有结果,ipc	
		
		
		if(i!=0 && i%10 == 0)//每获取10条专利就开始进行分词和存储,从而提高分词和存储的效率。经过试验10为最佳的长度
		{
			map<string, string>::iterator patent_begin = patent.begin();

			while( patent_begin != patent.end())
			{
				string strTemp;
				sResult = NLPIR_ParagraphProcess(patent_begin->second.c_str(),1);//分词后结果,为一个字符串形式	
				strTemp = sResult;//将分词结果char * 转换为 string 进行处理
								
				t.split(strTemp," ",splitRes,stopWordFlags);//对分词结果字符串进行切分,并过滤掉停用词,切分的标志是空格字符

				vector<string>::iterator splitRes_begin = splitRes.begin(),splitRes_end = splitRes.end();				
				t.getTermFre(splitRes_begin,splitRes_end,mpSplitResult);//对分词集合进行文档内的词频统计
				

				string tablename = patent_begin->first.substr(0,4);//取IPC的前4个字符,作为主题表名	
				
				map<string,int>::iterator mp_begin = mpSplitResult.begin();
				map<string,int>::iterator mp_end = mpSplitResult.end();
				t.getTermFreDocs(mp_begin, mp_end,resMap);

				splitRes.clear();//清空每条专利的分词
				mpSplitResult.clear();//清空每条专利的词频以及词频统计				
				
				patent_begin++;
			}
			patent.clear();//每处理10条专利就对向量集合进行清空
			
			
		}
	}	

	
	NLPIR_Exit();


	//3、将处理后的结果覆盖掉源文件;
	ofstream ofile;
	ofile.open(fileName,ios::trunc);
	
	map<string,struct freDocAttr>::iterator it = resMap.begin();
	while(it != resMap.end())
	{
		stringstream sTemp;	

		ofile<<it->first<<",";

		sTemp<<it->second.freq;
		ofile<<sTemp.str()<<",";
		sTemp.str("");
		

		sTemp<<it->second.docs;
		ofile<<sTemp.str()<<",";
		sTemp.str("");

		sTemp<<it->second.attr;
		ofile<<sTemp.str()<<"\n";
		sTemp.str("");

		it++;
	}
	
	ofile.close();

	//++++++++++++++++++++++++++++++++++++++++++++++++++++++

	end = clock();
	cout<<"总耗时耗时: "<<1000*(end-start)/CLOCKS_PER_SEC<<" ms\n";


	
	return 0;
	
}