int API_DOC2VEC::doc2vec( vector< string > vecQuaryWord, //[In]quary word map< vector< string >, vector< float > > &mapQuaryDoc ) //[Out]quary Doc { if ( vecQuaryWord.size() == 0 ) { printf( "API_DOC2VEC::doc2vec err!!vecQuaryWord.size:%ld, do not find KeyWord!!\n", vecQuaryWord.size() ); return -1; } /*****************************Init:Commen*****************************/ int nRet = 0; string word; long long i, j, Count; /*****************************Init*****************************/ vector<string> outQueryWords; vector<string> queryCutWords; map< string, vector< float > > mapQuaryWord; //[In]quary word map< string, float > mapQuaryTf; //[In]quary TF map< string,int >::iterator itStopwords; map< string, float >::iterator itQuaryTf; //[In]dic word map< string, vector< float > >::iterator itDicWord; //[In]dic word //doc2vec_cutword outQueryWords.clear(); for ( j=0;j<vecQuaryWord.size();j++ ) { queryCutWords.clear(); //Cut Word app.cut( vecQuaryWord[j], queryCutWords, METHOD_MIX ); //Check && Save GetFeat for ( i=0;i<queryCutWords.size();i++ ) { if ( (queryCutWords[i].empty()) || (queryCutWords[i] == " ") || (queryCutWords[i] == " ") || (api_commen.IsDigit2(queryCutWords[i])) || (queryCutWords[i].size()<2) ) { continue; } itStopwords = stopwords.find(queryCutWords[i]); if (itStopwords == stopwords.end()) // not find { outQueryWords.push_back( queryCutWords[i] ); } } } /* //check info printf("Cut Words:"); for(i=0;i<outQueryWords.size();i++) printf( "%s ", outQueryWords[i].c_str() ); printf( "\n" );*/ mapQuaryWord.clear(); //[In]quary word mapQuaryTf.clear(); //[In]quary TF //Count Quary TF by Quary Vector for (i = 0; i < outQueryWords.size(); i++) { word = outQueryWords[i]; //Quary Vector itDicWord = mapDicWord.find(word); if (itDicWord != mapDicWord.end()) // find it mapQuaryWord[word] = itDicWord->second; else continue; //Quary TF itQuaryTf = mapQuaryTf.find(word); if (itQuaryTf != mapQuaryTf.end()) // find it mapQuaryTf[word]++; else mapQuaryTf[word] = 1; } //Quary TF Count = mapQuaryTf.size(); if ( Count == 0 ) { printf( "idQuarySize:%ld, do not find KeyWord!!\n", outQueryWords.size() ); return -1; } for(itQuaryTf = mapQuaryTf.begin(); itQuaryTf != mapQuaryTf.end(); itQuaryTf++) { mapQuaryTf[itQuaryTf->first] = itQuaryTf->second*1.0/Count; } //word2doc mapQuaryDoc.clear(); nRet = doc2vec_word2doc( mapQuaryWord, mapQuaryTf, mapQuaryDoc ); if ( nRet != 0 ) { printf("doc2vec_train:word2doc err!!\n"); return nRet; } return nRet; }
/************************************************************************* > File Name: ./app.cpp > Author: > Mail: > Created Time: Mon 03 Aug 2015 10:13:19 AM CST ************************************************************************/ #include<iostream> int a = 10; std::string str("hello"); CppJieba::Application app("./dict/jieba.dict.utf8", "./dict/hmm_model.utf8", "./dict/user.dict.utf8", "./dict/idf.utf8", "./dict/stop_words.utf8"); using namespace std; void test() { app.cut(std::string, std::vector<std::string>) { } }