int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, CitationSet& citationSet, UniGramFeature& uniGrams, BiGramFeature& biGrams, JournalSet& journalSet, FeatureSet& allFeatures, int printLog)
{
    int rtn = 0;
    // Unigram features go directly into allFeatures; bigram and journal
    // features are built separately and merged in afterwards.
    allFeatures.mFeatures.clear();
    allFeatures.mMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
    allFeatures.mFeatures.resize(tokenVector.size());

    FeatureSet biFeatures;
    biFeatures.mMaxIndex = biGrams.mDictionary.rbegin()->first + 1;
    biFeatures.mFeatures.resize(tokenVector.size());

    FeatureSet jourFeatures;
    jourFeatures.mMaxIndex = journalSet.mJournals.rbegin()->first + 1;
    jourFeatures.mFeatures.resize(tokenVector.size());

    int numThreads = omp_get_num_procs();
    if (printLog != SILENT)
        clog << "CPU number: " << numThreads << endl;
    omp_set_num_threads(numThreads);

    if (printLog != SILENT)
        clog << "Start Parallel Extract Features" << endl;
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); i++)
    {
        uniGrams.Extract(*tokenVector[i], allFeatures.mFeatures[i]);
    }

#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); i++)
    {
        biGrams.Extract(*tokenVector[i], biFeatures.mFeatures[i]);
    }

    // Journal feature: a single binary indicator at the journal's ID.
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); ++i)
    {
        Journal* ptrJournal = journalSet.SearchJournalTitle(citationSet[tokenVector[i]->mPmid]->mJournalTitle);
        if (ptrJournal != NULL)
            jourFeatures.mFeatures[i][ptrJournal->mJournalId] = 1.0;
        else
            cerr << "Error: can't find \"" << citationSet[tokenVector[i]->mPmid]->mJournalTitle << "\" in pmid " << tokenVector[i]->mPmid << endl;
    }

    rtn = allFeatures.Merge(biFeatures);
    CHECK_RTN(rtn);
    rtn = allFeatures.Merge(jourFeatures);
    CHECK_RTN(rtn);
    rtn = allFeatures.Normalize();
    CHECK_RTN(rtn);
    return 0;
}
int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, UniGramFeature& uniGrams, BiGramFeature& biGrams, feature_node** &featureSpace, int printLog)
{
    int numThreads = omp_get_num_procs();
    if (printLog != SILENT)
        clog << "CPU number: " << numThreads << endl;
    omp_set_num_threads(numThreads);

    if (printLog != SILENT)
        clog << "Extract unigram & bigram" << endl;

    // Allocate one feature_node row per citation; rows are filled in the
    // parallel loop below and ownership passes to the caller.
    if (printLog != SILENT)
        clog << "Make Feature table" << endl;
    int featureNum = (int)tokenVector.size();
    featureSpace = Malloc(feature_node*, featureNum);
    memset(featureSpace, 0, sizeof(feature_node*) * featureNum);

    int uniMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
    int biMaxIndex = biGrams.mDictionary.rbegin()->first + 1;

    if (printLog != SILENT)
        clog << "Extract features parallel" << endl;
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); i++)
    {
        // Per-thread scratch sets, each holding a single instance.
        FeatureSet tabAllFeatures;
        tabAllFeatures.mMaxIndex = uniMaxIndex;
        tabAllFeatures.mFeatures.resize(1);
        FeatureSet tabBiFeatures;
        tabBiFeatures.mMaxIndex = biMaxIndex;
        tabBiFeatures.mFeatures.resize(1);

        uniGrams.Extract(*tokenVector[i], tabAllFeatures.mFeatures[0]);
        biGrams.Extract(*tokenVector[i], tabBiFeatures.mFeatures[0]);

        tabAllFeatures.Merge(tabBiFeatures);
        tabAllFeatures.Normalize();

        featureSpace[i] = NULL;
        LinearMachine::TransFeatures(featureSpace[i], tabAllFeatures.mFeatures[0]);
    }
    return 0;
}
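// The feature_node** overloads hand the allocated table to the caller with no
// matching release routine in this file. A minimal cleanup sketch, assuming
// (as the Malloc macro and liblinear convention suggest) that TransFeatures
// allocates each row with malloc; the helper name is hypothetical:
static void FreeFeatureSpace(feature_node** &featureSpace, int featureNum)
{
    if (featureSpace == NULL)
        return;
    for (int i = 0; i < featureNum; ++i)
        free(featureSpace[i]); // free(NULL) is harmless for rows never filled
    free(featureSpace);
    featureSpace = NULL;
}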
int MetalabelFeature::ExtractFeature(const std::vector<TokenCitation*>& tokenVector, UniGramFeature& uniGrams, FeatureSet& allFeatures, int printLog)
{
    int rtn = 0;
    // Unigram-only variant: extract in parallel, then normalize in place.
    allFeatures.mFeatures.clear();
    allFeatures.mMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
    allFeatures.mFeatures.resize(tokenVector.size());

    int numThreads = omp_get_num_procs();
    if (printLog != SILENT)
        clog << "CPU number: " << numThreads << endl;
    omp_set_num_threads(numThreads);

    if (printLog != SILENT)
        clog << "Start Parallel Extract Features" << endl;
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); i++)
    {
        uniGrams.Extract(*tokenVector[i], allFeatures.mFeatures[i]);
    }

    rtn = allFeatures.Normalize();
    CHECK_RTN(rtn);
    return 0;
}
int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, CitationSet& citationSet, UniGramFeature& uniGrams, BiGramFeature& biGrams, JournalSet& journalSet, feature_node** &featureSpace, int printLog)
{
    int numThreads = omp_get_num_procs();
    if (printLog != SILENT)
        clog << "CPU number: " << numThreads << endl;
    omp_set_num_threads(numThreads);

    if (printLog != SILENT)
        clog << "Extract unigram & bigram" << endl;

    if (printLog != SILENT)
        clog << "Make Feature table" << endl;
    int featureNum = (int)tokenVector.size();
    featureSpace = Malloc(feature_node*, featureNum);
    memset(featureSpace, 0, sizeof(feature_node*) * featureNum);

    int uniMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
    int biMaxIndex = biGrams.mDictionary.rbegin()->first + 1;
    int jourMaxIndex = journalSet.mJournals.rbegin()->first + 1;

    if (printLog != SILENT)
        clog << "Extract features parallel" << endl;
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenVector.size(); i++)
    {
        FeatureSet tabAllFeatures;
        tabAllFeatures.mMaxIndex = uniMaxIndex;
        tabAllFeatures.mFeatures.resize(1);
        FeatureSet tabBiFeatures;
        tabBiFeatures.mMaxIndex = biMaxIndex;
        tabBiFeatures.mFeatures.resize(1);

        uniGrams.Extract(*tokenVector[i], tabAllFeatures.mFeatures[0]);
        biGrams.Extract(*tokenVector[i], tabBiFeatures.mFeatures[0]);

        // Journal feature: a single binary indicator at the journal's ID.
        FeatureSet tabJourFeatures;
        tabJourFeatures.mMaxIndex = jourMaxIndex;
        tabJourFeatures.mFeatures.resize(1);
        if (citationSet[tokenVector[i]->mPmid]->mJournalTitle != NULL)
        {
            Journal* ptrJournal = journalSet.SearchJournalTitle(citationSet[tokenVector[i]->mPmid]->mJournalTitle);
            if (ptrJournal != NULL)
                tabJourFeatures.mFeatures[0][ptrJournal->mJournalId] = 1.0;
            else
                cerr << "Error: can't find journal \"" << citationSet[tokenVector[i]->mPmid]->mJournalTitle << "\" in journal set for pmid " << tokenVector[i]->mPmid << endl;
        }
        else
            cerr << "Error: can't find journal title in citation for pmid " << tokenVector[i]->mPmid << endl;

        tabAllFeatures.Merge(tabBiFeatures);
        tabAllFeatures.Merge(tabJourFeatures);
        tabAllFeatures.Normalize();

        featureSpace[i] = NULL;
        LinearMachine::TransFeatures(featureSpace[i], tabAllFeatures.mFeatures[0]);
    }
    return 0;
}
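// Illustrative call site for the feature_node** overloads (variable names and
// the FreeFeatureSpace helper above are assumptions, not part of this API):
//
//     feature_node** featureSpace = NULL;
//     int rtn = MetalabelFeature::ExtractFeature(tokenVector, citationSet,
//         uniGrams, biGrams, journalSet, featureSpace, STATUS_ONLY);
//     CHECK_RTN(rtn);
//     // ... feed featureSpace[i] to a liblinear-style predictor ...
//     FreeFeatureSpace(featureSpace, (int)tokenVector.size());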
int SaveNeighbor()
{
    int rtn = 0;
    LhtcDocumentSet lshtcTrainSet, lshtcTestSet;
    UniGramFeature uniGrams;
    string trainsetFile = "../data/loc_train.bin";
    string testsetFile = "../data/loc_test.bin";
    vector<int> lshtcTrainFeatureID, lshtcTestFeatureID;

    clog << "Load Unigram Dictionary" << endl;
    rtn = uniGrams.Load("lshtc_unigram_dictionary_loctrain.bin");
    CHECK_RTN(rtn);
    clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

    rtn = lshtcTrainSet.LoadBin(trainsetFile, FULL_LOG);
    CHECK_RTN(rtn);
    // Collect document IDs and pointers in the same map order, so feature
    // row i corresponds to lshtcTrainFeatureID[i].
    vector<LhtcDocument*> vecTrainDocument;
    vecTrainDocument.reserve(lshtcTrainSet.Size());
    for (map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
    {
        lshtcTrainFeatureID.push_back(it->first);
        vecTrainDocument.push_back(&(it->second));
    }

    clog << "Prepare for Extract Features" << endl;
    FeatureSet allTrainFeatures;
    allTrainFeatures.mFeatures.resize(vecTrainDocument.size());
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)vecTrainDocument.size(); i++)
    {
        uniGrams.ExtractLhtc(*vecTrainDocument[i], allTrainFeatures.mFeatures[i]);
        if (allTrainFeatures.mFeatures[i].size() == 0)
            printf("%d Warning!!\n", i);
    }
    allTrainFeatures.Normalize(); // train-set features ready

    rtn = lshtcTestSet.LoadBin(testsetFile, FULL_LOG);
    CHECK_RTN(rtn);
    vector<LhtcDocument*> vecTestDocument;
    vecTestDocument.reserve(lshtcTestSet.Size());
    for (map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
    {
        lshtcTestFeatureID.push_back(it->first);
        vecTestDocument.push_back(&(it->second));
    }

    clog << "Prepare for Extract Features" << endl;
    FeatureSet allTestFeatures;
    allTestFeatures.mFeatures.resize(vecTestDocument.size());
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)vecTestDocument.size(); i++)
    {
        uniGrams.ExtractLhtc(*vecTestDocument[i], allTestFeatures.mFeatures[i]);
        if (allTestFeatures.mFeatures[i].size() == 0)
            printf("%d Warning!!\n", i);
    }
    allTestFeatures.Normalize(); // test-set features ready

    // Split the test set into 5 equal shards and build/save the neighbor
    // structure for each shard against the full train set. Note integer
    // division drops the last Size() % 5 instances.
    int sigSize = allTestFeatures.Size() / 5;
    for (int i = 0; i < 5; ++i)
    {
        string filename = "../data/lshtc_neighbor" + intToString(i) + ".bin";
        if (FileExist(filename))
            continue;
        clog << i << "th, sigSize = " << sigSize << endl;
        FeatureSet locFeatures;
        vector<int> locIds;
        for (int j = sigSize * i; j < sigSize * (i + 1); ++j)
        {
            locFeatures.AddInstance(allTestFeatures[j]);
            locIds.push_back(lshtcTestFeatureID[j]);
        }
        FeatureNeighbor featureneighbor;
        rtn = featureneighbor.Build(allTrainFeatures.mFeatures, locFeatures.mFeatures, lshtcTrainFeatureID, locIds);
        CHECK_RTN(rtn);
        rtn = featureneighbor.SaveBin(filename, STATUS_ONLY);
        CHECK_RTN(rtn);
        clog << "Save bin completed" << endl;
    }
    return 0;
}
// Unigram features only. Predicted scores are saved grouped by model: the
// scores produced by one model are written as one record.
int SavePredictScore(string tokenPath, string uniGramFile, string labelFreqFile, string modelPath, const int modelNum, string scoreFilePath)
{
    int rtn = 0;
    clog << "Loading Tokenization Result" << endl;
    LhtcDocumentSet tokenDocuments;
    rtn = tokenDocuments.LoadBin(tokenPath.c_str(), STATUS_ONLY); // e.g. "pratest_6020.bin"
    CHECK_RTN(rtn);

    clog << "Load Unigram Dictionary" << endl;
    UniGramFeature uniGrams;
    rtn = uniGrams.Load(uniGramFile.c_str());
    CHECK_RTN(rtn);
    clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

    clog << "Load Label Frequence" << endl;
    map<int, double> labelFreq;
    rtn = LoadLabelFreq(labelFreqFile.c_str(), labelFreq);
    CHECK_RTN(rtn);

    // Keep the modelNum most frequent labels whose model files exist.
    vector<pair<int, double> > meshSort;
    for (map<int, double>::iterator it = labelFreq.begin(); it != labelFreq.end(); ++it)
        meshSort.push_back(make_pair(it->first, it->second));
    sort(meshSort.begin(), meshSort.end(), CmpScore);

    vector<int> modelIds;
    for (size_t i = 0; i < (size_t)modelNum && i < meshSort.size(); ++i)
    {
        string modelFile = modelPath + "/" + intToString(meshSort[i].first) + ".model";
        if (FileExist(modelFile))
            modelIds.push_back(meshSort[i].first);
    }
    clog << modelIds.size() << " Models Available" << endl;

    // Document IDs and feature rows share the map iteration order.
    vector<LhtcDocument*> tokenDocVector;
    vector<int> pmids;
    tokenDocVector.reserve(tokenDocuments.Size());
    for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
    {
        pmids.push_back(it->first);
        tokenDocVector.push_back(&(it->second));
    }

    clog << "Prepare for Extract Features" << endl;
    FeatureSet allFeatures;
    allFeatures.mFeatures.resize(tokenDocVector.size());
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < (int)tokenDocVector.size(); i++)
    {
        uniGrams.ExtractLhtc(*tokenDocVector[i], allFeatures.mFeatures[i]);
    }
    allFeatures.Normalize();

    if (pmids.size() != (size_t)allFeatures.Size())
    {
        clog << "Error: pmids.size != allFeatures.size" << endl;
        return -1;
    }

    FILE* outScoreFile = fopen(scoreFilePath.c_str(), "wb");
    if (outScoreFile == NULL)
        return -1;

    clog << "Start Predict" << endl;
    int numThreads = omp_get_num_procs();
    omp_set_num_threads(numThreads);

    // File layout: model count, then one (modelId, vector<(pmid, score)>)
    // record per model.
    rtn = Write(outScoreFile, modelIds.size());
    CHECK_RTN(rtn);
    for (unsigned int k = 0; k < modelIds.size(); k++)
    {
        if ((k & 255) == 0)
            clog << "LOG : Working for model " << modelIds[k] << endl;
        string modelFile = modelPath + "/" + intToString(modelIds[k]) + ".model";
        LinearMachine linearMachine;
        rtn = linearMachine.Load(modelFile);
        CHECK_RTN(rtn);

        pair<int, vector<pair<int, double> > > modelScore;
        modelScore.first = modelIds[k];
        modelScore.second.resize(allFeatures.Size());
#pragma omp parallel for schedule(dynamic)
        for (int i = 0; i < allFeatures.Size(); i++)
        {
            modelScore.second[i].first = pmids[i];
            linearMachine.Predict(allFeatures[i], modelScore.second[i].second);
        }
        rtn = Write(outScoreFile, modelScore);
        CHECK_RTN(rtn);
    }
    fclose(outScoreFile);
    outScoreFile = NULL;
    clog << "Save Complete" << endl;
    return 0;
}
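// Illustrative invocation of SavePredictScore (all paths except the tokenized
// input hinted at above are hypothetical placeholders, and 10000 is an
// arbitrary model budget):
//
//     int rtn = SavePredictScore("../data/pratest_6020.bin",
//         "lshtc_unigram_dictionary_loctrain.bin", "../data/label_freq.bin",
//         "../models", 10000, "../data/predict_scores.bin");
//     CHECK_RTN(rtn);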