Example #1
int FeatureSet::Merge(FeatureSet& anotherFeatureSet)
{
	if (Size() != anotherFeatureSet.Size())
	{
		cerr << "Error: the sizes of two FeatureSets are not equal.Can't merge!" << endl;
		return -1;
	}

	for (int i = 0; i < Size(); ++i)
	{
		// Shift the other set's indices past this set's current maximum so
		// the two sparse feature vectors occupy disjoint index ranges.
		for (Feature::iterator it = anotherFeatureSet.mFeatures[i].begin(); it != anotherFeatureSet.mFeatures[i].end(); ++it)
		{
			mFeatures[i][mMaxIndex + 1 + it->first] = it->second;
		}
	}
	mMaxIndex += anotherFeatureSet.mMaxIndex + 1;

	return 0;
}
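A minimal usage sketch of Merge (an assumption-laden illustration, not project code): assuming Feature is a sparse map from index to value and each FeatureSet holds one instance per citation, merging keeps the first set's indices and offsets the second set's indices by the old mMaxIndex + 1.

FeatureSet titleFeature, abstractFeature;
// ... fill both sets with one instance per citation ...
if (titleFeature.Merge(abstractFeature) == 0)
{
	// Instance i now holds both feature groups: title features keep their
	// original indices, while abstract indices were offset by the old
	// titleFeature.mMaxIndex + 1, so the groups cannot collide.
}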
Example #2
int MeshInfoSet::InitializeMeshEntry(CitationSet& citationSet, MeshRecordSet& meshRecords, EntryMapFeature& entryMap, int printLog)
{
	int rtn = 0;

	mTitleMeshEntryOcurNum.clear();
	mTitleMeshEntryPairOcurNum.clear();
	mAbstractMeshEntryOcurNum.clear();
	mAbstractMeshEntryPairOcurNum.clear();

	vector<Citation*> citationVector;
	citationVector.reserve(citationSet.Size());
	for (map<int, Citation*>::iterator it = citationSet.mCitations.begin(); it != citationSet.mCitations.end(); ++it)
		citationVector.push_back(it->second);

	if (printLog != SILENT)
		clog << "Extract title entry feature" << endl;
	FeatureSet titleFeature;
	rtn = entryMap.ExtractTitle(citationVector, titleFeature);
	CHECK_RTN(rtn);

	if (printLog != SILENT)
		clog << "Extract abstract entry feature" << endl;
	FeatureSet abstractFeature;
	rtn = entryMap.ExtractAbstract(citationVector, abstractFeature);
	CHECK_RTN(rtn);

	for (int i = 0; i < titleFeature.Size(); ++i)
	{
		if (printLog != SILENT && (i & ((1 << 18) - 1)) == 0) // progress every 2^18 citations
			clog << "\r" << i << " citation titles counted";
		for (Feature::iterator it = titleFeature[i].begin(); it != titleFeature[i].end(); ++it)
		{
			if (it->second > 0.0)
			{
				rtn = AddMeshNum(it->first, mTitleMeshEntryOcurNum);
				CHECK_RTN(rtn);

				// Enumerate each unordered pair of co-occurring entries once.
				Feature::iterator it2 = it;
				while ((++it2) != titleFeature[i].end())
				{
					if (it2->second > 0.0)
					{
						rtn = AddMeshPairNum(it->first, it2->first, mTitleMeshEntryPairOcurNum);
						CHECK_RTN(rtn);
					}
				}
			}
		}
	}

	if (printLog != SILENT)
	{
		clog << "\nTotal " << mTitleMeshEntryOcurNum.size() << " meshs occur entry in title" << endl;
		clog << "Total " << mTitleMeshEntryPairOcurNum.size() << " meshs pair occur entry in title" << endl;
	}

	for (int i = 0; i < abstractFeature.Size(); ++i)
	{
		if (printLog != SILENT && (i & ((1 << 18) - 1)) == 0) // progress every 2^18 citations
			clog << "\r" << i << " citation abstracts counted";
		for (Feature::iterator it = abstractFeature[i].begin(); it != abstractFeature[i].end(); ++it)
		{
			if (it->second > 0.0)
			{
				rtn = AddMeshNum(it->first, mAbstractMeshEntryOcurNum);
				CHECK_RTN(rtn);

				// Enumerate each unordered pair of co-occurring entries once.
				Feature::iterator it2 = it;
				while ((++it2) != abstractFeature[i].end())
				{
					if (it2->second > 0.0)
					{
						rtn = AddMeshPairNum(it->first, it2->first, mAbstractMeshEntryPairOcurNum);
						CHECK_RTN(rtn);
					}
				}
			}
		}
	}
	if (printLog != SILENT)
	{
		clog << "\nTotal " << mAbstractMeshEntryOcurNum.size() << " meshs occur entry in abstract" << endl;
		clog << "Total " << mAbstractMeshEntryPairOcurNum.size() << " meshs pair occur entry in abstract" << endl;
	}
	return 0;
}
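Neither CHECK_RTN nor the counter helpers appear in this excerpt. Below is a minimal sketch under stated assumptions: CHECK_RTN is taken to be an early-return-on-error macro, and the occurrence counters are taken to be maps keyed by MeSH ID and by ID pair. The real definitions live elsewhere in the project.

// Hypothetical sketches, not the project's actual definitions.
#define CHECK_RTN(err) do { if ((err) != 0) return (err); } while (0) // assumed early-return macro

int AddMeshNum(int meshId, map<int, int>& ocurNum)
{
	++ocurNum[meshId]; // operator[] value-initializes a missing count to 0
	return 0;
}

int AddMeshPairNum(int firstId, int secondId, map<pair<int, int>, int>& pairOcurNum)
{
	if (firstId > secondId)
		swap(firstId, secondId); // canonical order so (a,b) and (b,a) share one key
	++pairOcurNum[make_pair(firstId, secondId)];
	return 0;
}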
Example #3
int SaveNeighbor()
{
	int rtn = 0;
	LhtcDocumentSet lshtcTrainSet, lshtcTestSet;
	UniGramFeature uniGrams;
	string trainsetFile = "../data/loc_train.bin";
	string testsetFile = "../data/loc_test.bin";
	vector<int> lshtcTrainFeatureID, lshtcTestFeatureID;

	clog << "Load Unigram Dictionary" << endl;
	rtn = uniGrams.Load("lshtc_unigram_dictionary_loctrain.bin");
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	rtn = lshtcTrainSet.LoadBin(trainsetFile, FULL_LOG);
	CHECK_RTN(rtn);

	// Collect document IDs and document pointers in a single pass.
	vector<LhtcDocument*> vecTrainDocument;
	vecTrainDocument.reserve(lshtcTrainSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
	{
		lshtcTrainFeatureID.push_back(it->first);
		vecTrainDocument.push_back(&(it->second));
	}

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTrainFeatures;
	allTrainFeatures.mFeatures.resize(vecTrainDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTrainDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTrainDocument[i], allTrainFeatures.mFeatures[i]);
		if (allTrainFeatures.mFeatures[i].size() == 0) printf("Warning: train document %d has an empty feature vector!\n", i);
	}
	allTrainFeatures.Normalize(); // normalize the train-data features

	rtn = lshtcTestSet.LoadBin(testsetFile, FULL_LOG);
	CHECK_RTN(rtn);

	vector<LhtcDocument*> vecTestDocument;
	vecTestDocument.reserve(lshtcTestSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
	{
		lshtcTestFeatureID.push_back(it->first);
		vecTestDocument.push_back(&(it->second));
	}

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTestFeatures;
	allTestFeatures.mFeatures.resize(vecTestDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTestDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTestDocument[i], allTestFeatures.mFeatures[i]);
		if (allTestFeatures.mFeatures[i].size() == 0) printf("Warning: test document %d has an empty feature vector!\n", i);
	}
	allTestFeatures.Normalize(); // normalize the test-data features

	// Split the test set into 5 shards; give the remainder to the last
	// shard so no instance is dropped when the size is not divisible by 5.
	int sigSize = allTestFeatures.Size() / 5;
	for (int i = 0; i < 5; ++i)
	{
		string filename = "../data/lshtc_neighbor" + intToString(i) + ".bin";
		if (FileExist(filename))
			continue;
		clog << i << "th shard, sigSize = " << sigSize << endl;
		FeatureSet locFeatures;
		vector<int> locIds;
		int shardEnd = (i == 4) ? allTestFeatures.Size() : sigSize * (i + 1);
		for (int j = sigSize * i; j < shardEnd; ++j)
		{
			locFeatures.AddInstance(allTestFeatures[j]);
			locIds.push_back(lshtcTestFeatureID[j]);
		}
		FeatureNeighbor featureneighbor;
		rtn = featureneighbor.Build(allTrainFeatures.mFeatures, locFeatures.mFeatures, lshtcTrainFeatureID, locIds);
		CHECK_RTN(rtn);

		rtn = featureneighbor.SaveBin(filename, STATUS_ONLY);
		CHECK_RTN(rtn);
		clog << "Save bin completed" << endl;
	}

	return 0;
}
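A possible read-back sketch for the shards written above, assuming FeatureNeighbor exposes a LoadBin counterpart to the SaveBin used here (an assumption; only SaveBin appears in this excerpt):

// Hypothetical sketch: iterate over the five shard files written by SaveNeighbor.
int LoadNeighborShards()
{
	int rtn = 0;
	for (int i = 0; i < 5; ++i)
	{
		string filename = "../data/lshtc_neighbor" + intToString(i) + ".bin";
		FeatureNeighbor featureNeighbor;
		rtn = featureNeighbor.LoadBin(filename, STATUS_ONLY); // assumed counterpart of SaveBin
		CHECK_RTN(rtn);
		// ... query nearest training neighbors for the test instances in shard i ...
	}
	return 0;
}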
Example #4
// Uses unigram features only. Prediction scores are saved per model: the scores predicted by one model are stored as one record.
int SavePredictScore(string tokenPath, string uniGramFile, string labelFreqFile, string modelPath, const int modelNum, string scoreFilePath)
{
	int rtn = 0;

	clog << "Loading Tokenization Result" << endl;
	LhtcDocumentSet tokenDocuments;
	rtn = tokenDocuments.LoadBin(tokenPath.c_str(), STATUS_ONLY); // e.g. "pratest_6020.bin"
	CHECK_RTN(rtn);

	clog << "Load Unigram Dictionary" << endl;
	UniGramFeature uniGrams;
	rtn = uniGrams.Load(uniGramFile.c_str());
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	clog << "Load Label Frequence" << endl;
	map<int, double> labelFreq;
	rtn = LoadLabelFreq(labelFreqFile.c_str(), labelFreq);
	CHECK_RTN(rtn);

	vector<pair<int, double> > meshSort;
	for (map<int, double>::iterator it = labelFreq.begin(); it != labelFreq.end(); ++it)
		meshSort.push_back(make_pair(it->first, it->second));
	sort(meshSort.begin(), meshSort.end(), CmpScore);

	vector<int> modelIds;
	modelIds.clear();
	for (size_t i = 0; i < (size_t)modelNum && i < meshSort.size(); ++i)
	{
		string modelFile = modelPath + "/" + intToString(meshSort[i].first) + ".model";
		if (FileExist(modelFile))
		{
			modelIds.push_back(meshSort[i].first);
		}
	}
	clog << modelIds.size() << " Models Available" << endl;

	vector<LhtcDocument*> tokenDocVector;
	tokenDocVector.reserve(tokenDocuments.Size());
	for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
	{
		tokenDocVector.push_back(&(it->second));
	}

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allFeatures;
	allFeatures.mFeatures.resize(tokenDocVector.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)tokenDocVector.size(); i++)
	{
		uniGrams.ExtractLhtc(*tokenDocVector[i], allFeatures.mFeatures[i]);
	}
	allFeatures.Normalize();

	vector<int> pmids;
	for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
		pmids.push_back(it->first);

	if (pmids.size() != (size_t)allFeatures.Size())
	{
		clog << "Error: pmids.size != allFeatures.size" << endl;
		return -1;
	}

	//clog << "Free Memory" << endl;
	//tokenCitations.~TokenCitationSet();

	FILE * outScoreFile = fopen(scoreFilePath.c_str(), "wb");
	if (outScoreFile == NULL)
		return -1;
	clog << "Start Predict" << endl;
	int numThreads = omp_get_num_procs();
	omp_set_num_threads(numThreads); // use all available cores
	rtn = Write(outScoreFile, modelIds.size()); // write the model count first
	CHECK_RTN(rtn);
	for (unsigned int k = 0; k < modelIds.size(); k++)
	{
		if ((k & 255) == 0) // progress every 256 models
		{
			clog << "LOG : Working for model " << modelIds[k] << endl;
		}
		string modelFile = modelPath + "/" + intToString(modelIds[k]) + ".model";
		LinearMachine linearMachine;
		rtn = linearMachine.Load(modelFile);
		CHECK_RTN(rtn);
		pair<int, vector<pair<int, double>>> modelScore;
		modelScore.first = modelIds[k];
		modelScore.second.resize(allFeatures.Size());
#pragma omp parallel for schedule(dynamic)
		for (int i = 0; i < allFeatures.Size(); i++)
		{
			modelScore.second[i].first = pmids[i];
			linearMachine.Predict(allFeatures[i], modelScore.second[i].second);
		}
		rtn = Write(outScoreFile, modelScore);
		CHECK_RTN(rtn);
	}
	fclose(outScoreFile);
	outScoreFile = NULL;
	clog << "Save Complete" << endl;
	return 0;
}
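A possible reader for the score file written above, assuming Read overloads mirror the Write overloads used in SavePredictScore (an assumption; only Write appears in this excerpt):

// Hypothetical sketch: read back the per-model score records.
int LoadPredictScore(string scoreFilePath, vector<pair<int, vector<pair<int, double>>>>& modelScores)
{
	int rtn = 0;
	FILE* inScoreFile = fopen(scoreFilePath.c_str(), "rb");
	if (inScoreFile == NULL)
		return -1;
	size_t modelCount = 0;
	rtn = Read(inScoreFile, modelCount); // assumed counterpart of Write
	CHECK_RTN(rtn);
	modelScores.resize(modelCount);
	for (size_t k = 0; k < modelCount; ++k)
	{
		// Each record is (labelId, [(pmid, score), ...]) as written above.
		rtn = Read(inScoreFile, modelScores[k]);
		CHECK_RTN(rtn);
	}
	fclose(inScoreFile);
	return 0;
}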