/*
 * ParagraphProcess - SQL-callable function that segments a text value with NLPIR.
 *
 * Argument 0: the text to segment (e.g. a Chinese sentence).
 * Returns:    a text value holding whatever NLPIR_ParagraphProcess(input, 1)
 *             produced. According to the original author's note, the second
 *             argument selects the tag set: 1 adds first-level part-of-speech
 *             tags to the output, 0 omits them.
 *
 * Raises ERROR when the segmenter cannot be initialised or returns no result.
 */
PGDLLEXPORT Datum ParagraphProcess(PG_FUNCTION_ARGS)
{
    // Fetch the argument as a (possibly packed) text value and let
    // text_to_cstring() detoast it and build a NUL-terminated copy. The
    // terminator matters: without it the segmenter reads past the payload and
    // the output ends in garbage.
    text       *in = PG_GETARG_TEXT_PP(0);
    char       *s = text_to_cstring(in);
    const char *splitWords;

    // Initialise the segmenter; a failed init must not fall through to the call below.
    if (!NLPIR_Init(sDataPath, 1, 0))
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter could not be initialized")));

    // Call the third-party segmentation API; it hands back a C string pointer.
    splitWords = NLPIR_ParagraphProcess(s, 1);
    if (splitWords == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter returned no result")));

    // NOTE(review): the original author recorded that calling NLPIR_Exit()
    // before the result is converted fails, so the library is left initialised.
    // The result pointer is not freed here either (the original had pfree
    // commented out).

    // Convert the C string to a text value for the SQL caller.
    PG_RETURN_TEXT_P(cstring_to_text(splitWords));
}
/*
 * chinese_tokenize - SQL-callable debugging helper around the NLPIR segmenter.
 *
 * Argument 0: a C string (fetched with PG_GETARG_CSTRING) to segment.
 * Returns:    int32, the byte length (strlen) of the segmented string produced
 *             by NLPIR_ParagraphProcess(input, 0).
 *
 * History kept from the original comments: earlier attempts took a text
 * argument and returned the segmented text, but the tail of the result was
 * garbled; the author concluded the cause was in the argument handling that
 * preceded the segmentation call, not in the segmentation itself.
 *
 * Raises ERROR when the segmenter cannot be initialised or returns no result.
 */
PGDLLEXPORT Datum chinese_tokenize(PG_FUNCTION_ARGS)
{
    char       *in = PG_GETARG_CSTRING(0);
    const char *splitWords;

    // Initialise the segmenter; do not segment with a library that failed to start.
    if (!NLPIR_Init(sDataPath, 1, 0))
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter could not be initialized")));

    // Segment the input; the library returns a pointer to the resulting string.
    splitWords = NLPIR_ParagraphProcess(in, 0);
    if (splitWords == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter returned no result")));

    // strlen counts bytes and excludes the terminating NUL.
    PG_RETURN_INT32((int32) strlen(splitWords));
}
//PG_FUNCTION_INFO_V1(testprs_star); //PGDLLEXPORT Datum testprs_start(PG_FUNCTION_ARGS) { char *in =(char *)PG_GETARG_POINTER(0); int len; char *s;//将输入参数中的字符串指向内容拷贝到此字符串指针 char *splitWords='\0'; text *out;//输出结果为文本类型 len=VARSIZE(in)-VARHDRSZ; s = (char *)palloc(len+1); memcpy(s,VARDATA(in),len);//拷贝输入字符串到临时字符串 //*(s+len)='\0';//解决乱码的关键 s[len]='\0';//设置字符串的结尾标准为\0。这是必须的不然会出现乱码。 //splitWords=(char *)palloc(4*strlen(s)+1);//为出入结果分配足够的内存空间以便保存即将分析后的字符串 //splitWords=(char *)malloc(strlen(in)+1); NLPIR_Init(sDataPath,1,0);//初始化分词器 splitWords= (char *)NLPIR_ParagraphProcess(s,1);//调用第三方分词包API进行分词,返回为字符串指针.第二个参数代表标注集的数字,1表示显示计算所一级标注集,0表示不显示标准集 ParserState *pst = (ParserState *) palloc0(sizeof(ParserState));//初始化解析器以便分配内存空间 //pst->buffer =(char *)PG_GETARG_POINTER(0); //char *in =(char *)PG_GETARG_POINTER(0); pst->buffer = splitWords; //splitWords=(char *)palloc(4*strlen(s)+1);//为出入结果分配足够的内存空间以便保存即将分析后的字符串 //splitWords=(char *)malloc(strlen(in)+1); //NLPIR_Exit();//error //NLPIR_Init(sDataPath,0,0);//初始化分词器 //pst -> s = scws;//初始化分词句柄 //pst->buffer = (char *) PG_GETARG_POINTER(0);//第一个参数,需要分析的文本 pst->sege = splitWords; pst->len = PG_GETARG_INT32(1);//注意参数的序号为0、1、2等 //pst->sege=(char *)NLPIR_ParagraphProcess(pst->buffer,0);//进行分词,返回为字符串指针 //(pst->buffer) = (pst->sege); //pst->len=strlen(pst->sege); pst->pos = 0; PG_RETURN_POINTER(pst);//return }
static void init() { if(!NLPIR_Init(sDataPath,1,0))//数据在当前根路径下,默认为GBK编码的分词 { //printf("ICTCLAS INIT FAILED!\n"); //return; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Chinese Parser Lib SCWS could not init!\"%s\"","" ))); } }
/// Builds the dictionary object: loads four settings from the "textquery"
/// section of the shared configuration into members, then starts NLPIR with
/// the configured data directory in UTF-8 mode. If NLPIR cannot be started the
/// constructor prints a message to standard output and terminates the process
/// with exit status 1.
NM::CDiction::CDiction()
{
    CConfig* const settings = NM::CConfig::getInstance();

    // Directory settings; the glosses come from the original author's pinyin notes.
    settings->getconfigInfo("textquery", "distdir", m_distdir);   // corpus directory
    settings->getconfigInfo("textquery", "datadir", m_datadir);   // word-segmentation data directory
    settings->getconfigInfo("textquery", "stopdir", m_stopdir);   // presumably the stop-word directory
    settings->getconfigInfo("textquery", "wordfreqpath", m_wordfreqpath);

    // Nothing else to do when the segmenter starts successfully.
    if (NLPIR_Init(m_datadir.c_str(), UTF8_CODE))
    {
        return;
    }

    std::cout << "NLPLR Init Fail!" << std::endl;
    exit(1);
}
const struct result_tokenize_s *tokenize_v2(const char *intext) { struct result_tokenize_s *pVecResult; //char *sResult=NULL;如果保存结果的常量在这里会出错.返回的结果最后作为全局变量放到函数体外 NLPIR_Init(sDataPath,1,0);//初始化分词函数 (pVecResult->splitWords_s)=(char *)NLPIR_ParagraphProcess(intext,0);//直接这样导入intext参数会导致分词结果出错,或乱码. // =strlen(sResult); (pVecResult->splitWords_len)=Int32GetDatum(strlen(pVecResult->splitWords_s)); return pVecResult;//测试表明,错不在上面这个分词函数 }
/* bak tokenize const char *tokenize(const char *intext) { //const char *sResult=NULL; handle=NLPIR_Init(sDataPath,1,0); if(!handle)//数据在当前根路径下,默认为GBK编码的分词,即encode=0,若encode=1则为utf8.postgresql默认是utf8编码. { //printf("ICTCLAS INIT FAILED!\n"); //return; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"","" ))); } else // printf("NLPIR DATA INIT OK!"); ereport(NOTICE, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"","" ))); sResult=NLPIR_ParagraphProcess(intext,0); return sResult; } */ static const char * tokenize(const char *intext) { //char *sResult=NULL;如果保存结果的常量在这里会出错.返回的结果最后作为全局变量放到函数体外 NLPIR_Init(sDataPath,1,0);//初始化分词函数 sResult=NLPIR_ParagraphProcess(intext,0);//直接这样导入intext参数会导致分词结果出错,或乱码. // =strlen(sResult); //tokentext_length=strlen(sResult); return sResult;//测试表明,错不在上面这个分词函数 }
// Command-line filter: starts NLPIR in UTF-8 mode with argv[1] as the data
// path, then reads whitespace-separated chunks from standard input and prints
// the segmentation of each chunk on its own line. Exits with status 1 if the
// library cannot be initialised, otherwise returns 0 once input is exhausted.
int main(int argc,char* argv[])
{
    if (NLPIR_Init(argv[1], UTF8_CODE))
    {
        // operator>> splits on whitespace, so each iteration handles one chunk.
        for (std::string chunk; std::cin >> chunk; )
        {
            const char* segmented = NLPIR_ParagraphProcess(chunk.c_str(), 0);
            std::cout << segmented << std::endl;
        }

        NLPIR_Exit();
        return 0;
    }

    // Initialisation failed.
    printf("Init Fail!\n");
    exit(1);
}
static const char * ParagraphProcess2( char *intxt) { char *in=NULL;//输入参数为文本类型 int len; char *s; const char *splitWords; const char *out=NULL; len=VARSIZE(in)-VARHDRSZ; s = (char *)palloc(len+1); memcpy(s,VARDATA(in),len); *(s+len)='\0';//解决乱码的关键 splitWords=(char *)palloc(4*strlen(s)+1); NLPIR_Init(sDataPath,1,0);//初始化分词器 splitWords=NLPIR_ParagraphProcess(s,0);//进行分词,返回为字符串指针 out = splitWords;//将字符类型转换为文本类型 return out;//输出为文本类型 }
const char *tokenize(char *intext){ handle=NLPIR_Init(sDataPath,1,0); if(!handle)//数据在当前根路径下,默认为GBK编码的分词,即encode=0,若encode=1则为utf8.postgresql默认是utf8编码. { //printf("ICTCLAS INIT FAILED!\n"); //return; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"","" ))); } else // printf("NLPIR DATA INIT OK!"); ereport(NOTICE, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"","" ))); sResult=NLPIR_ParagraphProcess(intext,0); return sResult; }
/*
 * out_len - SQL-callable function returning the length of a segmented text.
 *
 * Argument 0: text to segment.
 * Returns:    int32, the byte length (strlen) of the string produced by
 *             NLPIR_ParagraphProcess(input, 0).
 *
 * Raises ERROR when the segmenter cannot be initialised or returns no result.
 */
PGDLLEXPORT Datum out_len(PG_FUNCTION_ARGS)
{
    char       *in_txt = (char *)PG_GETARG_TEXT_P(0);
    char       *s;          // NUL-terminated copy of the input payload
    int         len;
    const char *out_txt;

    // Copy the text payload and terminate it; the terminator is what prevents
    // garbled segmenter output.
    len = VARSIZE(in_txt)-VARHDRSZ;
    s = (char *)palloc(len+1);
    memcpy(s,VARDATA(in_txt),len);
    s[len] = '\0';

    // Initialise the segmenter; a failed init must not fall through.
    if (!NLPIR_Init(sDataPath,1,0))
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter could not be initialized")));

    // Segment the copy; the library returns a pointer to the resulting string.
    out_txt = NLPIR_ParagraphProcess(s,0);
    if (out_txt == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("NLPIR segmenter returned no result")));

    PG_RETURN_INT32((int32) strlen(out_txt));
}
int main(int argc, char* argv[]) { const char * sResult; if(!NLPIR_Init()) { printf("Init fails\n"); return -1; } //ofstream log; //log.open("F:\\VS8_Projects\\NLPIR\\log1.txt"); clock_t start,end; map<string,string> patent ;//存储IPC号和 标题+摘要字符串对 vector<string> splitRes;//存储每一个字符串分词后的结果 vector<string> stopWordFlags;//停用词集合标志 map<string,int> mpSplitResult; //对每一个分词进行文档内的词频统计 map<string,struct freDocAttr> resMap; TermFreqDocs t; CPostgre pg; t.generateStopWordFlags(stopWordFlags);//生成停用词集合 start = clock();//程序执行初始时间标记 string hostchinapatent = "dbname=chinapatent user=postgres password=123456 host=localhost port=5432"; PGconn * conn = pg.connectDatabase(hostchinapatent.c_str()); //连接chinapatent数据库 string hostsubclassterm = "dbname=subclass_term user=postgres password=123456 host=localhost port=5432"; PGconn * connsubclassterm = pg.connectDatabase(hostsubclassterm.c_str()); //连接subclass_term数据库 char * sql = "select ic1,ti,ab from chinapatent where ad >= '2000-01-01 00:00:00'::timestamp without time zone AND ad <= '2010-12-31 00:00:00'::timestamp without time zone \ and pa like '%华为%'; "; string sqlTemp = sql; pg.GBKToUTF8(sqlTemp); PGresult *respg = PQexec(conn,sqlTemp.c_str()); end = clock(); cout<<"满足专利数目:"<<PQntuples(respg)<<"\n"; cout<<"数据库查询耗时:"<<1000*(end-start)/CLOCKS_PER_SEC<<" ms\n"; //打开存放结果文件 int lineMaxSize = 50; ifstream file; const char * fileName = "log2.txt"; file.open(fileName); char *line = new char[lineMaxSize]; struct freDocAttr fda; //map<string,struct freDocAttr> resMap; vector<string> resVec; //1、读取存储结果的文件 while(file.getline(line,lineMaxSize)) { string sline = line; if(sline.size() == 0) break; t.split(sline,",",resVec,stopWordFlags); fda.freq = atoi(resVec.at(1).c_str()); fda.docs = atoi(resVec.at(2).c_str()); fda.attr = atoi(resVec.at(3).c_str()); resMap.insert( make_pair(resVec.at(0), fda) ); resVec.clear(); } file.close(); //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ start = clock();//程序执行初始时间标记 if( PQresultStatus( respg ) 
!= PGRES_TUPLES_OK) { cout<<PQerrorMessage(conn); PQclear(respg); return false; } int patentNum = PQntuples(respg);//查询到满足条件的元组数目 for(int i = 0; i < patentNum; i++) { string sPatent; pair<string,string> patentTemp; patentTemp.first = PQgetvalue(respg,i,0); patentTemp.second = PQgetvalue(respg,i,1); patentTemp.second += PQgetvalue(respg,i,2); t.UTF8ToGBK(patentTemp.second); patent.insert(patentTemp);//获取满足条件的元组的第一列所有结果,ipc if(i!=0 && i%10 == 0)//每获取10条专利就开始进行分词和存储,从而提高分词和存储的效率。经过试验10为最佳的长度 { map<string, string>::iterator patent_begin = patent.begin(); while( patent_begin != patent.end()) { string strTemp; sResult = NLPIR_ParagraphProcess(patent_begin->second.c_str(),1);//分词后结果,为一个字符串形式 strTemp = sResult;//将分词结果char * 转换为 string 进行处理 t.split(strTemp," ",splitRes,stopWordFlags);//对分词结果字符串进行切分,并过滤掉停用词,切分的标志是空格字符 vector<string>::iterator splitRes_begin = splitRes.begin(),splitRes_end = splitRes.end(); t.getTermFre(splitRes_begin,splitRes_end,mpSplitResult);//对分词集合进行文档内的词频统计 string tablename = patent_begin->first.substr(0,4);//取IPC的前4个字符,作为主题表名 map<string,int>::iterator mp_begin = mpSplitResult.begin(); map<string,int>::iterator mp_end = mpSplitResult.end(); t.getTermFreDocs(mp_begin, mp_end,resMap); splitRes.clear();//清空每条专利的分词 mpSplitResult.clear();//清空每条专利的词频以及词频统计 patent_begin++; } patent.clear();//每处理10条专利就对向量集合进行清空 } } NLPIR_Exit(); //3、将处理后的结果覆盖掉源文件; ofstream ofile; ofile.open(fileName,ios::trunc); map<string,struct freDocAttr>::iterator it = resMap.begin(); while(it != resMap.end()) { stringstream sTemp; ofile<<it->first<<","; sTemp<<it->second.freq; ofile<<sTemp.str()<<","; sTemp.str(""); sTemp<<it->second.docs; ofile<<sTemp.str()<<","; sTemp.str(""); sTemp<<it->second.attr; ofile<<sTemp.str()<<"\n"; sTemp.str(""); it++; } ofile.close(); //++++++++++++++++++++++++++++++++++++++++++++++++++++++ end = clock(); cout<<"总耗时耗时: "<<1000*(end-start)/CLOCKS_PER_SEC<<" ms\n"; return 0; }