int FeatureSet::Merge(FeatureSet& anotherFeatureSet)
{
	if (Size() != anotherFeatureSet.Size())
	{
		cerr << "Error: the sizes of the two FeatureSets are not equal. Can't merge!" << endl;
		return -1;
	}
	// Append the other set's feature space after ours: shift every incoming
	// feature index past mMaxIndex so the two index ranges cannot collide.
	for (int i = 0; i < Size(); ++i)
	{
		for (Feature::iterator it = anotherFeatureSet.mFeatures[i].begin(); it != anotherFeatureSet.mFeatures[i].end(); ++it)
		{
			mFeatures[i][mMaxIndex + 1 + it->first] = it->second;
		}
	}
	mMaxIndex += anotherFeatureSet.mMaxIndex + 1;
	return 0;
}
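// Usage sketch (hypothetical values; assumes Feature is a map<int, double>,
// mFeatures is a vector<Feature>, and mMaxIndex holds the largest feature
// index in the set): Merge concatenates the two feature spaces, shifting the
// second set's indices past mMaxIndex.
//
//   FeatureSet a, b;                 // both must hold the same number of instances
//   a.mFeatures = { { {0, 1.0} } };  a.mMaxIndex = 0;
//   b.mFeatures = { { {0, 2.0} } };  b.mMaxIndex = 0;
//   a.Merge(b);                      // a.mFeatures[0] == { {0, 1.0}, {1, 2.0} }, a.mMaxIndex == 1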
int MeshInfoSet::InitializeMeshEntry(CitationSet& citationSet, MeshRecordSet& meshRecords, EntryMapFeature& entryMap, int printLog)
{
	int rtn = 0;
	mTitleMeshEntryOcurNum.clear();
	mTitleMeshEntryPairOcurNum.clear();
	mAbstractMeshEntryOcurNum.clear();
	mAbstractMeshEntryPairOcurNum.clear();

	vector<Citation*> citationVector;
	citationVector.reserve(citationSet.Size());
	for (map<int, Citation*>::iterator it = citationSet.mCitations.begin(); it != citationSet.mCitations.end(); ++it)
		citationVector.push_back(it->second);

	if (printLog != SILENT)
		clog << "Extract title entry feature" << endl;
	FeatureSet titleFeature;
	rtn = entryMap.ExtractTitle(citationVector, titleFeature);
	CHECK_RTN(rtn);

	if (printLog != SILENT)
		clog << "Extract abstract entry feature" << endl;
	FeatureSet abstractFeature;
	rtn = entryMap.ExtractAbstract(citationVector, abstractFeature);
	CHECK_RTN(rtn);

	for (int i = 0; i < titleFeature.Size(); ++i)
	{
		// Log progress every 2^18 citations.
		if (printLog != SILENT && (i & ((1 << 18) - 1)) == 0)
			clog << "\r" << i << " citation title counting";
		for (Feature::iterator it = titleFeature[i].begin(); it != titleFeature[i].end(); ++it)
		{
			if (it->second > 0.0)
			{
				rtn = AddMeshNum(it->first, mTitleMeshEntryOcurNum);
				CHECK_RTN(rtn);
				// Count each unordered pair of co-occurring MeSH entries once:
				// it2 starts one past it, so a pair is never visited twice.
				Feature::iterator it2 = it;
				while ((++it2) != titleFeature[i].end())
				{
					if (it2->second > 0)
					{
						rtn = AddMeshPairNum(it->first, it2->first, mTitleMeshEntryPairOcurNum);
						CHECK_RTN(rtn);
					}
				}
			}
		}
	}
	if (printLog != SILENT)
	{
		clog << "\nTotal " << mTitleMeshEntryOcurNum.size() << " MeSH entries occur in titles" << endl;
		clog << "Total " << mTitleMeshEntryPairOcurNum.size() << " MeSH entry pairs occur in titles" << endl;
	}

	for (int i = 0; i < abstractFeature.Size(); ++i)
	{
		if (printLog != SILENT && (i & ((1 << 18) - 1)) == 0)
			clog << "\r" << i << " citation abstract counting";
		for (Feature::iterator it = abstractFeature[i].begin(); it != abstractFeature[i].end(); ++it)
		{
			if (it->second > 0.0)
			{
				rtn = AddMeshNum(it->first, mAbstractMeshEntryOcurNum);
				CHECK_RTN(rtn);
				Feature::iterator it2 = it;
				while ((++it2) != abstractFeature[i].end())
				{
					if (it2->second > 0)
					{
						rtn = AddMeshPairNum(it->first, it2->first, mAbstractMeshEntryPairOcurNum);
						CHECK_RTN(rtn);
					}
				}
			}
		}
	}
	if (printLog != SILENT)
	{
		clog << "\nTotal " << mAbstractMeshEntryOcurNum.size() << " MeSH entries occur in abstracts" << endl;
		clog << "Total " << mAbstractMeshEntryPairOcurNum.size() << " MeSH entry pairs occur in abstracts" << endl;
	}
	return 0;
}
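// A minimal sketch of the counting helpers used above (semantics and signatures
// are assumptions inferred from the call sites; the real declarations live with
// MeshInfoSet): AddMeshNum increments a per-ID occurrence counter, and
// AddMeshPairNum increments a counter keyed on the unordered ID pair, so
// (a, b) and (b, a) share one count.
//
//   int MeshInfoSet::AddMeshNum(int id, map<int, int>& ocurNum)
//   {
//       ++ocurNum[id];
//       return 0;
//   }
//
//   int MeshInfoSet::AddMeshPairNum(int firstId, int secondId, map<pair<int, int>, int>& pairOcurNum)
//   {
//       if (firstId > secondId)
//           swap(firstId, secondId); // canonical order for the unordered pair
//       ++pairOcurNum[make_pair(firstId, secondId)];
//       return 0;
//   }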
int SaveNeighbor()
{
	int rtn = 0;
	LhtcDocumentSet lshtcTrainSet, lshtcTestSet;
	UniGramFeature uniGrams;
	string trainsetFile = "../data/loc_train.bin";
	string testsetFile = "../data/loc_test.bin";
	vector<int> lshtcTrainFeatureID, lshtcTestFeatureID;

	clog << "Load Unigram Dictionary" << endl;
	rtn = uniGrams.Load("lshtc_unigram_dictionary_loctrain.bin");
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	rtn = lshtcTrainSet.LoadBin(trainsetFile, FULL_LOG);
	CHECK_RTN(rtn);
	for (map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
		lshtcTrainFeatureID.push_back(it->first);
	vector<LhtcDocument*> vecTrainDocument;
	vecTrainDocument.reserve(lshtcTrainSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
		vecTrainDocument.push_back(&(it->second));

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTrainFeatures;
	allTrainFeatures.mFeatures.resize(vecTrainDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTrainDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTrainDocument[i], allTrainFeatures.mFeatures[i]);
		if (allTrainFeatures.mFeatures[i].size() == 0)
			printf("%d Warning!!\n", i);
	}
	allTrainFeatures.Normalize(); // train-set features

	rtn = lshtcTestSet.LoadBin(testsetFile, FULL_LOG);
	CHECK_RTN(rtn);
	for (map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
		lshtcTestFeatureID.push_back(it->first);
	vector<LhtcDocument*> vecTestDocument;
	vecTestDocument.reserve(lshtcTestSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
		vecTestDocument.push_back(&(it->second));

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTestFeatures;
	allTestFeatures.mFeatures.resize(vecTestDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTestDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTestDocument[i], allTestFeatures.mFeatures[i]);
		if (allTestFeatures.mFeatures[i].size() == 0)
			printf("%d Warning!!\n", i);
	}
	allTestFeatures.Normalize(); // test-set features

	// Split the test set into 5 chunks and build/save one neighbor index per chunk.
	int sigSize = allTestFeatures.Size() / 5;
	for (int i = 0; i < 5; ++i)
	{
		string filename = "../data/lshtc_neighbor" + intToString(i) + ".bin";
		if (FileExist(filename))
			continue;
		clog << i << "th, sigSize = " << sigSize << endl;
		FeatureSet locFeatures;
		vector<int> locIds;
		// The last chunk absorbs the remainder so no test instance is dropped
		// when the test-set size is not divisible by 5.
		int chunkEnd = (i == 4) ? allTestFeatures.Size() : sigSize * (i + 1);
		for (int j = sigSize * i; j < chunkEnd; ++j)
		{
			locFeatures.AddInstance(allTestFeatures[j]);
			locIds.push_back(lshtcTestFeatureID[j]);
		}
		FeatureNeighbor featureneighbor;
		rtn = featureneighbor.Build(allTrainFeatures.mFeatures, locFeatures.mFeatures, lshtcTrainFeatureID, locIds);
		CHECK_RTN(rtn);
		rtn = featureneighbor.SaveBin(filename, STATUS_ONLY);
		CHECK_RTN(rtn);
		clog << "Save bin completed" << endl;
	}
	return 0;
}
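// Normalize() is assumed here to L2-normalize each instance's feature vector,
// a common choice before cosine-similarity neighbor search such as
// FeatureNeighbor::Build; a minimal sketch under that assumption:
//
//   void FeatureSet::Normalize()
//   {
//       for (size_t i = 0; i < mFeatures.size(); ++i)
//       {
//           double norm = 0.0;
//           for (Feature::iterator it = mFeatures[i].begin(); it != mFeatures[i].end(); ++it)
//               norm += it->second * it->second;
//           if (norm <= 0.0)
//               continue; // skip empty instances (the extraction loop warns on them)
//           norm = sqrt(norm);
//           for (Feature::iterator it = mFeatures[i].begin(); it != mFeatures[i].end(); ++it)
//               it->second /= norm;
//       }
//   }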
// Only unigram features are used. Predicted scores are saved model by model:
// all scores predicted by one model are written as one record.
int SavePredictScore(string tokenPath, string uniGramFile, string labelFreqFile, string modelPath, const int modelNum, string scoreFilePath)
{
	int rtn = 0;
	clog << "Loading Tokenization Result" << endl;
	LhtcDocumentSet tokenDocuments;
	rtn = tokenDocuments.LoadBin(tokenPath.c_str(), STATUS_ONLY); // e.g. "pratest_6020.bin"
	CHECK_RTN(rtn);

	clog << "Load Unigram Dictionary" << endl;
	UniGramFeature uniGrams;
	rtn = uniGrams.Load(uniGramFile.c_str());
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	clog << "Load Label Frequency" << endl;
	map<int, double> labelFreq;
	rtn = LoadLabelFreq(labelFreqFile.c_str(), labelFreq);
	CHECK_RTN(rtn);

	// Sort labels by frequency so the modelNum most frequent labels are tried first.
	vector<pair<int, double> > meshSort;
	for (map<int, double>::iterator it = labelFreq.begin(); it != labelFreq.end(); ++it)
		meshSort.push_back(make_pair(it->first, it->second));
	sort(meshSort.begin(), meshSort.end(), CmpScore);

	vector<int> modelIds;
	for (size_t i = 0; i < (size_t)modelNum && i < meshSort.size(); ++i)
	{
		string modelFile = modelPath + "/" + intToString(meshSort[i].first) + ".model";
		if (FileExist(modelFile))
			modelIds.push_back(meshSort[i].first);
	}
	clog << modelIds.size() << " Models Available" << endl;

	vector<LhtcDocument*> tokenDocVector;
	tokenDocVector.reserve(tokenDocuments.Size());
	for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
		tokenDocVector.push_back(&(it->second));

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allFeatures;
	allFeatures.mFeatures.resize(tokenDocVector.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)tokenDocVector.size(); i++)
		uniGrams.ExtractLhtc(*tokenDocVector[i], allFeatures.mFeatures[i]);
	allFeatures.Normalize();

	vector<int> pmids;
	for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
		pmids.push_back(it->first);
	if (pmids.size() != (size_t)allFeatures.Size())
	{
		clog << "Error: pmids.size != allFeatures.size" << endl;
		return -1;
	}

	FILE* outScoreFile = fopen(scoreFilePath.c_str(), "wb");
	if (outScoreFile == NULL)
		return -1;

	clog << "Start Predict" << endl;
	int numThreads = omp_get_num_procs();
	omp_set_num_threads(numThreads);
	rtn = Write(outScoreFile, modelIds.size()); // write the (size_t) model count first
	CHECK_RTN(rtn);
	for (unsigned int k = 0; k < modelIds.size(); k++)
	{
		if ((k & 255) == 0)
			clog << "LOG : Working for model " << modelIds[k] << endl;
		string modelFile = modelPath + "/" + intToString(modelIds[k]) + ".model";
		LinearMachine linearMachine;
		rtn = linearMachine.Load(modelFile);
		CHECK_RTN(rtn);
		// One record per model: the model ID plus a (pmid, score) pair per document.
		pair<int, vector<pair<int, double> > > modelScore;
		modelScore.first = modelIds[k];
		modelScore.second.resize(allFeatures.Size());
#pragma omp parallel for schedule(dynamic)
		for (int i = 0; i < allFeatures.Size(); i++)
		{
			modelScore.second[i].first = pmids[i];
			linearMachine.Predict(allFeatures[i], modelScore.second[i].second);
		}
		rtn = Write(outScoreFile, modelScore);
		CHECK_RTN(rtn);
	}
	fclose(outScoreFile);
	outScoreFile = NULL;
	clog << "Save Complete" << endl;
	return 0;
}
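// Hypothetical invocation (all paths are illustrative except the two filenames
// that appear elsewhere in this file): score the 10000 most frequent labels
// whose models exist under the model directory, writing one record per model.
//
//   SavePredictScore("../data/pratest_6020.bin",
//                    "lshtc_unigram_dictionary_loctrain.bin",
//                    "../data/label_freq.bin",
//                    "../models",
//                    10000,
//                    "../data/predict_score.bin");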