PGDLLEXPORT Datum
ParagraphProcess(PG_FUNCTION_ARGS)
{
	/* Input is a text value, e.g. "巴拿马和美国都是国家地区,汉族是一个民族。" */
	char *in = (char *) PG_GETARG_POINTER(0);
	int   len;
	char *s;                 /* NUL-terminated copy of the input string */
	char *splitWords = NULL; /* tokenizer output */
	text *out;               /* result returned as text */

	len = VARSIZE(in) - VARHDRSZ;
	s = (char *) palloc(len + 1);
	memcpy(s, VARDATA(in), len);     /* copy the input into the temp buffer */
	s[len] = '\0';                   /* required; without the terminator the output is garbled */

	NLPIR_Init(sDataPath, 1, 0);     /* initialize the tokenizer */

	/*
	 * Call the third-party NLPIR API; it returns a string pointer.  The
	 * second argument selects the POS tag set: 1 shows the ICT level-1 tag
	 * set, 0 suppresses tags.  Calling NLPIR_Exit() before the return broke
	 * things; calling it after the return is fine.
	 */
	splitWords = (char *) NLPIR_ParagraphProcess(s, 1);

	out = cstring_to_text(splitWords);
	PG_RETURN_TEXT_P(out);           /* convert the C string result to text and return */
}
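/*
 * A minimal sketch of the fmgr boilerplate these V1 functions need; the
 * function names match the ones in this file, everything else is standard
 * PostgreSQL API.  Without PG_MODULE_MAGIC the module will not load, and
 * without PG_FUNCTION_INFO_V1 the V1 calling convention is undeclared.
 */
#include "postgres.h"
#include "fmgr.h"
#include "utils/builtins.h"             /* cstring_to_text */

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;                        /* required once per loadable module */
#endif

PG_FUNCTION_INFO_V1(ParagraphProcess);  /* declare the V1 call convention */
PG_FUNCTION_INFO_V1(chinese_tokenize);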
PGDLLEXPORT Datum
chinese_tokenize(PG_FUNCTION_ARGS)
{
	/*
	 * An earlier version took a text argument and copied it into a palloc'd
	 * buffer; the last character of the tokenized result came out garbled.
	 * The fault was in that copying code, not in the tokenizer itself, so
	 * the function now takes a cstring directly.
	 */
	const char *splitWords;
	char *in = PG_GETARG_CSTRING(0);  /* e.g. "我是中国人" */

	NLPIR_Init(sDataPath, 1, 0);      /* initialize the tokenizer */
	splitWords = NLPIR_ParagraphProcess(in, 0);  /* tokenize; returns a string pointer */

	/*
	 * Debug return value.  strlen counts bytes, not characters, and excludes
	 * the trailing '\0'; strlen("我是中国人") is 10 because a CJK character
	 * occupies two bytes in GBK.  The tokenized string measured 19 here and
	 * the raw input 15 -- hence the encoding sketch below.
	 */
	PG_RETURN_INT32(strlen(splitWords));
}
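/*
 * A standalone sketch (not part of the extension) of why the lengths above
 * differ: strlen counts bytes, not characters, and one CJK character is
 * 2 bytes in GBK but 3 bytes in UTF-8, so the same five-character string
 * measures 10 or 15 bytes depending on the encoding the server hands us.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char utf8[] = "\xe6\x88\x91\xe6\x98\xaf\xe4\xb8\xad\xe5\x9b\xbd\xe4\xba\xba"; /* 我是中国人, UTF-8 */
	const char gbk[]  = "\xce\xd2\xca\xc7\xd6\xd0\xb9\xfa\xc8\xcb";                     /* 我是中国人, GBK   */

	printf("UTF-8: %zu bytes\n", strlen(utf8)); /* prints 15 */
	printf("GBK:   %zu bytes\n", strlen(gbk));  /* prints 10 */
	return 0;
}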
//PG_FUNCTION_INFO_V1(testprs_star);
//PGDLLEXPORT
Datum
testprs_start(PG_FUNCTION_ARGS)
{
	char *in = (char *) PG_GETARG_POINTER(0);
	int   len;
	char *s;                 /* NUL-terminated copy of the input string */
	char *splitWords = NULL;
	ParserState *pst;

	len = VARSIZE(in) - VARHDRSZ;
	s = (char *) palloc(len + 1);
	memcpy(s, VARDATA(in), len);     /* copy the input into the temp buffer */
	s[len] = '\0';                   /* required; without the terminator the output is garbled */

	NLPIR_Init(sDataPath, 1, 0);     /* initialize the tokenizer */

	/* Second argument = 1: show the ICT level-1 POS tag set. */
	splitWords = (char *) NLPIR_ParagraphProcess(s, 1);

	/* Allocate and populate the parser state handed to subsequent calls. */
	pst = (ParserState *) palloc0(sizeof(ParserState));
	pst->buffer = splitWords;
	pst->sege = splitWords;
	pst->len = PG_GETARG_INT32(1);   /* argument indexes are 0, 1, 2, ... */
	pst->pos = 0;

	PG_RETURN_POINTER(pst);
}
void NM::CDiction::getwordfreq_file(std::string& filepath)
{
	std::cout << "filepath: " << filepath << std::endl;
	std::fstream fin;
	fin.open(filepath.c_str());
	if (!fin.is_open())
	{
		std::cout << filepath << " open failed" << std::endl;
		exit(0);
	}
	std::string line;
	std::string partiline;
	std::stringstream sin;
	std::string partiword;
	std::string str_upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
	size_t pos;                      /* was int; find_first_of returns size_t */
	MY_LANGTRAN::CLang lang;
	char *utfStr = NULL;
	int ret = -1;

	while (std::getline(fin, line))
	{
		if (line.size() == 0)
			continue;

		/* Convert the GBK input line to UTF-8 before tokenizing. */
		ret = lang.gbk2utf8(&utfStr, line.c_str());
		if (ret == -1)
			continue;
		line = utfStr;
		lang.destroy(&utfStr);

		/* Lower-case ASCII letters in place. */
		pos = 0;
		while ((pos = line.find_first_of(str_upper, pos)) != std::string::npos)
		{
			line[pos] = line[pos] + 'a' - 'A';
			pos++;
		}

		/*
		 * Spaces come in full-width and half-width forms; full-width spaces
		 * must be normalized to half-width.  Here punctuation is blanked
		 * out (cast to unsigned char: ispunct on a negative char is UB).
		 */
		for (size_t i = 0; i < line.size(); i++)
			if (ispunct((unsigned char) line[i]))
				line[i] = ' ';

		partiline = (char *) NLPIR_ParagraphProcess(line.c_str(), 0);
		sin.str(partiline);
		sin.clear();

		/* Count word frequencies, skipping stop words. */
		while (sin >> partiword)
		{
			if (m_set_stop.find(partiword) != m_set_stop.end())
				continue;
			m_word_freq[partiword]++;
		}
	}
	fin.close();
}
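/*
 * The loop above blanks ASCII punctuation but, as its comment notes,
 * full-width spaces also need normalizing.  A hedged sketch of that step
 * for UTF-8 input (the helper name is illustrative): replace each
 * full-width space (U+3000, bytes E3 80 80) with one ASCII space in place.
 */
#include <string.h>

static void normalize_fullwidth_space(char *s)
{
	char *r = s, *w = s;
	while (*r)
	{
		if ((unsigned char) r[0] == 0xE3 &&
			(unsigned char) r[1] == 0x80 &&
			(unsigned char) r[2] == 0x80)
		{
			*w++ = ' ';   /* one full-width space -> one half-width space */
			r += 3;
		}
		else
			*w++ = *r++;  /* copy every other byte through unchanged */
	}
	*w = '\0';
}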
const struct result_tokenize_s *
tokenize_v2(const char *intext)
{
	/*
	 * The struct must be allocated before we write through it; previously
	 * the pointer was left uninitialized.  Likewise, a local buffer for the
	 * result breaks things: the returned string lives outside this function
	 * (NLPIR owns it).
	 */
	struct result_tokenize_s *pVecResult =
		(struct result_tokenize_s *) palloc(sizeof(struct result_tokenize_s));

	NLPIR_Init(sDataPath, 1, 0);  /* initialize the tokenizer */

	/* Passing intext straight through occasionally garbled the output;
	 * testing showed the fault is not in NLPIR_ParagraphProcess itself. */
	pVecResult->splitWords_s = (char *) NLPIR_ParagraphProcess(intext, 0);
	pVecResult->splitWords_len = Int32GetDatum(strlen(pVecResult->splitWords_s));
	return pVecResult;
}
/* bak tokenize
const char *tokenize(const char *intext)
{
	handle = NLPIR_Init(sDataPath, 1, 0);
	if (!handle)
	{
		// Data lives under the current root path.  encode=0 means GBK
		// tokenization (the default); encode=1 means UTF-8.  PostgreSQL
		// defaults to UTF-8.
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"", "")));
	}
	else
		ereport(NOTICE,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"", "")));
	sResult = NLPIR_ParagraphProcess(intext, 0);
	return sResult;
}
*/

static const char *
tokenize(const char *intext)
{
	/*
	 * sResult must not be a local here or the result is corrupted; it is
	 * kept as a global outside the function body.
	 */
	NLPIR_Init(sDataPath, 1, 0);  /* initialize the tokenizer */

	/* Passing intext straight through occasionally garbled the output;
	 * testing showed the fault is not in NLPIR_ParagraphProcess itself. */
	sResult = NLPIR_ParagraphProcess(intext, 0);
	return sResult;
}
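/*
 * Both versions above call NLPIR_Init on every invocation.  A minimal
 * once-per-backend guard, assuming (as the checks above do) that
 * NLPIR_Init returns nonzero on success; the names are illustrative.
 */
static bool nlpir_ready = false;

static void
ensure_nlpir_init(void)
{
	if (nlpir_ready)
		return;
	if (!NLPIR_Init(sDataPath, 1, 0))   /* same arguments as above */
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("NLPIR could not init")));
	nlpir_ready = true;                 /* skip re-initialization next time */
}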
int main(int argc, char* argv[])
{
	if (argc < 2)
	{
		printf("usage: %s <data-path>\n", argv[0]);
		exit(1);
	}
	if (!NLPIR_Init(argv[1], UTF8_CODE))
	{
		printf("Init Fail!\n");
		exit(1);
	}
	char* pres;
	std::string line;
	while (std::cin >> line)
	{
		pres = (char*) NLPIR_ParagraphProcess(line.c_str(), 0);
		std::cout << pres << std::endl;
	}
	NLPIR_Exit();
	return 0;
}
PGDLLEXPORT Datum
SplitGBK(PG_FUNCTION_ARGS)
{
	/* Tokenization demo. */
	const char *sample1 = PG_GETARG_CSTRING(0);
	const char *sResult1;

	init();  /* initialize the tokenizer component */
	/* NLPIR_SetPOSmap(PKU_POS_MAP_SECOND) would select the POS tag set;
	 * the default is the ICT level-2 tag set. */
	//sample1 = "开心成长的烦恼学习深造中国证券市场";
	sResult1 = NLPIR_ParagraphProcess(sample1, 1);
	printf("%s\n", sResult1);
	PG_RETURN_CSTRING(sResult1);
}
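/*
 * PG_RETURN_CSTRING(sResult1) above hands PostgreSQL a buffer owned by
 * NLPIR.  A hedged sketch of the safer pattern (the function name is
 * hypothetical, init() is the same helper used above): copy the result
 * into palloc'd memory with pstrdup, a standard PostgreSQL helper.
 */
PGDLLEXPORT Datum
SplitGBK2(PG_FUNCTION_ARGS)
{
	const char *sample = PG_GETARG_CSTRING(0);
	const char *sResult;

	init();                                   /* initialize the tokenizer */
	sResult = NLPIR_ParagraphProcess(sample, 1);
	PG_RETURN_CSTRING(pstrdup(sResult));      /* copy into palloc'd memory */
}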
static const char *
ParagraphProcess2(char *intxt)
{
	/* intxt points at a text (varlena) value; it was previously ignored
	 * and a NULL pointer dereferenced in its place. */
	char *in = intxt;
	int   len;
	char *s;                 /* NUL-terminated copy of the input */
	const char *splitWords;

	len = VARSIZE(in) - VARHDRSZ;
	s = (char *) palloc(len + 1);
	memcpy(s, VARDATA(in), len);
	*(s + len) = '\0';       /* required to avoid garbled output */

	NLPIR_Init(sDataPath, 1, 0);               /* initialize the tokenizer */
	splitWords = NLPIR_ParagraphProcess(s, 0); /* tokenize; returns a string pointer */
	return splitWords;
}
const char *
tokenize(char *intext)
{
	handle = NLPIR_Init(sDataPath, 1, 0);
	if (!handle)
	{
		/*
		 * Data lives under the current root path.  encode=0 means GBK
		 * tokenization (the default); encode=1 means UTF-8.  PostgreSQL
		 * defaults to UTF-8.
		 */
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR SCWS could not init!\"%s\"", "")));
	}
	else
	{
		// printf("NLPIR DATA INIT OK!");
		ereport(NOTICE,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("Chinese Parser Lib NLPIR lcp580 ok init!\"%s\"", "")));
	}

	sResult = NLPIR_ParagraphProcess(intext, 0);
	return sResult;
}
PGDLLEXPORT Datum
out_len(PG_FUNCTION_ARGS)
{
	char *in_txt = (char *) PG_GETARG_TEXT_P(0);
	char *s;                 /* NUL-terminated copy of the input string */
	int   len;
	char *out_txt;

	len = VARSIZE(in_txt) - VARHDRSZ;
	s = (char *) palloc(len + 1);
	memcpy(s, VARDATA(in_txt), len);
	*(s + len) = '\0';       /* required to avoid garbled output */

	NLPIR_Init(sDataPath, 1, 0);                          /* initialize the tokenizer */
	out_txt = (char *) NLPIR_ParagraphProcess(s, 0);      /* tokenize */

	len = strlen(out_txt);
	PG_RETURN_INT32(len);
}
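/*
 * The palloc/memcpy/'\0' sequence above is exactly what PostgreSQL's
 * text_to_cstring helper (utils/builtins.h) already does.  A minimal
 * sketch of the same function using it; out_len2 is a hypothetical name.
 */
PGDLLEXPORT Datum
out_len2(PG_FUNCTION_ARGS)
{
	text *in_txt = PG_GETARG_TEXT_PP(0);
	char *s = text_to_cstring(in_txt);   /* palloc + copy + '\0' in one call */
	const char *out_txt;

	NLPIR_Init(sDataPath, 1, 0);
	out_txt = NLPIR_ParagraphProcess(s, 0);
	PG_RETURN_INT32((int32) strlen(out_txt));
}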
var_4 TF_word_extractor::calc_weight(var_1* _doc_beg, var_1* _doc_end, var_u2 _factor)
{
	assert(_doc_beg);
	assert(_doc_end);
	if (_doc_beg == _doc_end)
	{
		return RET_SECCEED;
	}

	var_1* text = _doc_beg;
	if (m_base_conf._self_cut)
	{
		// tokenize in place and move _doc_end to the end of the result
		*_doc_end = '\0';
		text = (var_1*)NLPIR_ParagraphProcess(_doc_beg);
		assert(text);
		for (_doc_end = text; *_doc_end; ++_doc_end);
	}

	var_u8 word_id = 0u;
	var_u8 tag_id = 0u;
	var_1* word_beg = NULL;
	var_1* word_end = NULL;
	var_1* tag_beg = NULL;
	var_1* tag_end = NULL;
	var_1* pos = text;
	var_u4 space_len = 0u;
	var_1 word[MAX_KEYWORD_LEN + 1] = {""};
	var_4 word_len = 0;
	var_f4 weight = 0.f;
	var_4 ret = 0;
	term_info_st* cur_feature = NULL;
	term_info_st** pre_feature = NULL;
	var_1 tag[100];

	while (_doc_end > pos)
	{
		// skip leading whitespace
		while (_doc_end > pos)
		{
			space_len = len_start_space(pos);
			if (0 < space_len)
				pos += space_len;
			else
				break;
		}
		word_beg = pos;

		// scan to the end of the "word/tag" token
		space_len = 0;
		while (_doc_end > pos)
		{
			if (0 > *pos)
			{
				if (' ' == *pos) // must not be a full-width Chinese space here; the ICT tokenizer guarantees that
				{
					space_len = 1;
					break;
				}
				++pos;
				if (_doc_end == pos)
					break;
			}
			else if (' ' == *pos)
				break;
			++pos;
		}
		tag_end = pos;

		// locate the POS tag: scan backward to the last '/'
		var_1* p = pos;
		for (; p != word_beg && '/' != *p; --p);
		if (p == word_beg)
			continue;
		word_end = p;
		tag_beg = p + 1;

		if (word_beg < word_end && tag_beg < tag_end)
		{
			word_len = tag_end - tag_beg;  // tag length (variable reused)
			if (100 <= word_len)           // '<=' keeps room for the '\0' in tag[100]
				continue;
			memcpy(tag, tag_beg, word_len);
			tag[word_len] = '\0';
			tag_id = m_md5er.MD5Bits64((var_u1*)tag, word_len);

			word_len = word_end - word_beg;
			if (MAX_KEYWORD_LEN < word_len)
				continue;
			memcpy(word, word_beg, word_len);
			word[word_len] = '\0';
			word_id = m_md5er.MD5Bits64((var_u1*)word, word_len);

			if (need_consider(word))
			{
				// process this word (words beginning with ASCII letters are kept too);
				// weight it by its POS tag
				var_4 tmp_TW = m_share_container->get_weight(tag_id);
				if (0 < tmp_TW)
				{
					weight = tmp_TW;
					weight *= _factor;
					if (m_features_size == m_features_capacity)
					{
						LOG_ERROR("text too long", "feature-word buffer exhausted!!!");
						break;
					}
					cur_feature = m_features_pointer + m_features_size;
					cur_feature->weight = weight;
					cur_feature->word_id = word_id;
					memcpy(cur_feature->word_str, word, word_len + 1);
					cur_feature->type = 0;
					ret = assign_word_type(cur_feature->type, tag);
					if (ret)
					{
						LOG_FAILE_CALL_RET("TF_word_extractor::calc_weight", "assign_word_type", ret);
						return RET_ERROR_INVALID_PARAM;
					}
					ret = m_feature_map.AddKey_FL(word_id, &cur_feature, (var_vd**)&pre_feature);
					if (0 > ret)
					{
						LOG_FAILE_CALL_RET("TF_word_extractor::calc_weight", "m_feature_map->AddKey_FL", ret);
						return RET_ERROR_INVALID_PARAM;
					}
					else if (1 == ret)
					{
						// key already present: accumulate weight, merge type, bump tf
						assert(*pre_feature);
						assert(0.f < (*pre_feature)->weight);
						assert((*pre_feature)->word_str[0]);
						(*pre_feature)->weight += weight;
						(*pre_feature)->type |= cur_feature->type;
						++(*pre_feature)->tf;
					}
					else
					{
						cur_feature->tf = 1;
						++m_features_size;
					}
				}
			}
		}
		else
		{
			assert(!"warning: missing word or missing POS tag");
		}
		pos += space_len; // skip the current whitespace character
	}
	return RET_SECCEED;
}
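/*
 * The backward scan above isolates one "word/tag" pair from NLPIR's tagged
 * output.  A standalone sketch of the same split using strrchr (names and
 * buffer sizes are illustrative, not from the original).
 */
#include <stdio.h>
#include <string.h>

/* Split an NLPIR token of the form "word/tag" at the last '/'.
 * Returns 0 on success, -1 when there is no usable word or tag. */
static int
split_token(const char *tok, char *word, char *tag, size_t cap)
{
	const char *slash = strrchr(tok, '/');
	size_t wlen;

	if (!slash || slash == tok || !slash[1])
		return -1;                 /* no tag: skip, as the loop above does */
	wlen = (size_t) (slash - tok);
	if (wlen >= cap || strlen(slash + 1) >= cap)
		return -1;                 /* would overflow the caller's buffers */
	memcpy(word, tok, wlen);
	word[wlen] = '\0';
	strcpy(tag, slash + 1);
	return 0;
}

int main(void)
{
	char w[64], t[64];
	if (split_token("中国/ns", w, t, sizeof w) == 0)
		printf("word=%s tag=%s\n", w, t);  /* word=中国 tag=ns */
	return 0;
}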
int main(int argc, char* argv[])
{
	const char * sResult;
	if (!NLPIR_Init())
	{
		printf("Init fails\n");
		return -1;
	}

	clock_t start, end;
	map<string, string> patent;              // IPC number -> title + abstract
	vector<string> splitRes;                 // tokens of one tokenized string
	vector<string> stopWordFlags;            // stop-word set flags
	map<string, int> mpSplitResult;          // per-document term frequencies
	map<string, struct freDocAttr> resMap;
	TermFreqDocs t;
	CPostgre pg;

	t.generateStopWordFlags(stopWordFlags);  // build the stop-word set

	start = clock();                         // start timing
	string hostchinapatent = "dbname=chinapatent user=postgres password=123456 host=localhost port=5432";
	PGconn * conn = pg.connectDatabase(hostchinapatent.c_str());              // connect to chinapatent
	string hostsubclassterm = "dbname=subclass_term user=postgres password=123456 host=localhost port=5432";
	PGconn * connsubclassterm = pg.connectDatabase(hostsubclassterm.c_str()); // connect to subclass_term

	const char * sql = "select ic1,ti,ab from chinapatent where ad >= '2000-01-01 00:00:00'::timestamp without time zone AND ad <= '2010-12-31 00:00:00'::timestamp without time zone \
		and pa like '%华为%'; ";
	string sqlTemp = sql;
	pg.GBKToUTF8(sqlTemp);
	PGresult *respg = PQexec(conn, sqlTemp.c_str());
	end = clock();
	cout << "matching patents: " << PQntuples(respg) << "\n";
	cout << "database query time: " << 1000*(end-start)/CLOCKS_PER_SEC << " ms\n";

	// open the file holding previous results
	int lineMaxSize = 50;
	ifstream file;
	const char * fileName = "log2.txt";
	file.open(fileName);
	char *line = new char[lineMaxSize];
	struct freDocAttr fda;
	vector<string> resVec;

	// 1. read the stored results back in
	while (file.getline(line, lineMaxSize))
	{
		string sline = line;
		if (sline.size() == 0)
			break;
		t.split(sline, ",", resVec, stopWordFlags);
		fda.freq = atoi(resVec.at(1).c_str());
		fda.docs = atoi(resVec.at(2).c_str());
		fda.attr = atoi(resVec.at(3).c_str());
		resMap.insert(make_pair(resVec.at(0), fda));
		resVec.clear();
	}
	file.close();
	delete[] line;

	start = clock();  // start timing the processing phase
	if (PQresultStatus(respg) != PGRES_TUPLES_OK)
	{
		cout << PQerrorMessage(conn);
		PQclear(respg);
		return -1;    // was "return false", i.e. 0/success on the error path
	}

	int patentNum = PQntuples(respg);  // tuples matching the query
	for (int i = 0; i < patentNum; i++)
	{
		pair<string, string> patentTemp;
		patentTemp.first = PQgetvalue(respg, i, 0);
		patentTemp.second = PQgetvalue(respg, i, 1);
		patentTemp.second += PQgetvalue(respg, i, 2);
		t.UTF8ToGBK(patentTemp.second);
		patent.insert(patentTemp);  // collect the IPC plus title/abstract

		// Tokenize and store every 10 patents; experiments showed a batch
		// of 10 gives the best tokenize/store throughput.
		if (i != 0 && i % 10 == 0)
		{
			map<string, string>::iterator patent_begin = patent.begin();
			while (patent_begin != patent.end())
			{
				string strTemp;
				sResult = NLPIR_ParagraphProcess(patent_begin->second.c_str(), 1); // tokenized result as one string
				strTemp = sResult;  // convert char* to string for processing
				t.split(strTemp, " ", splitRes, stopWordFlags); // split on spaces, dropping stop words
				vector<string>::iterator splitRes_begin = splitRes.begin(), splitRes_end = splitRes.end();
				t.getTermFre(splitRes_begin, splitRes_end, mpSplitResult); // per-document term frequencies
				string tablename = patent_begin->first.substr(0, 4); // first 4 IPC chars name the topic table
				map<string, int>::iterator mp_begin = mpSplitResult.begin();
				map<string, int>::iterator mp_end = mpSplitResult.end();
				t.getTermFreDocs(mp_begin, mp_end, resMap);
				splitRes.clear();      // reset per-patent tokens
				mpSplitResult.clear(); // reset per-patent frequencies
				patent_begin++;
			}
			patent.clear(); // flush the batch of 10
		}
	}
	// NB: a trailing batch of fewer than 10 patents is never flushed here.
	NLPIR_Exit();

	// 3. overwrite the source file with the updated results
	ofstream ofile;
	ofile.open(fileName, ios::trunc);
	map<string, struct freDocAttr>::iterator it = resMap.begin();
	while (it != resMap.end())
	{
		stringstream sTemp;
		ofile << it->first << ",";
		sTemp << it->second.freq;
		ofile << sTemp.str() << ",";
		sTemp.str("");
		sTemp << it->second.docs;
		ofile << sTemp.str() << ",";
		sTemp.str("");
		sTemp << it->second.attr;
		ofile << sTemp.str() << "\n";
		it++;
	}
	ofile.close();

	end = clock();
	cout << "total time: " << 1000*(end-start)/CLOCKS_PER_SEC << " ms\n";
	return 0;
}