Ejemplo n.º 1
0
int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, CitationSet& citationSet, UniGramFeature& uniGrams, BiGramFeature& biGrams, JournalSet& journalSet, FeatureSet& allFeatures, int printLog)
{
	int rtn = 0;
	allFeatures.mFeatures.clear();
	allFeatures.mMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
	allFeatures.mFeatures.resize(tokenVector.size());

	FeatureSet biFeatures;
	biFeatures.mMaxIndex = biGrams.mDictionary.rbegin()->first + 1;
	biFeatures.mFeatures.resize(tokenVector.size());

	FeatureSet jourFeatures;
	jourFeatures.mMaxIndex = journalSet.mJournals.rbegin()->first + 1;
	jourFeatures.mFeatures.resize(tokenVector.size());

	int numThreads = omp_get_num_procs();
	if (printLog != SILENT)
		clog << "CPU number: " << numThreads << endl;

	omp_set_num_threads(numThreads);
	if (printLog != SILENT)
		clog << "Start Parallel Extract Features" << endl;
#pragma omp parallel for schedule(dynamic) 
	for (int i = 0; i < tokenVector.size(); i++)
	{
		uniGrams.Extract(*tokenVector[i], allFeatures.mFeatures[i]);
	}

#pragma omp parallel for schedule(dynamic) 
	for (int i = 0; i < tokenVector.size(); i++)
	{
		biGrams.Extract(*tokenVector[i], biFeatures.mFeatures[i]);
	}

#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < tokenVector.size(); ++i)
	{
		Journal* ptrJournal = NULL;
		ptrJournal = journalSet.SearchJournalTitle(citationSet[tokenVector[i]->mPmid]->mJournalTitle);
		if (ptrJournal != NULL)
		{
			jourFeatures.mFeatures[i][ptrJournal->mJournalId] = 1.0;
		}
		else
		{
			cerr << "Error: can't find \"" << citationSet[tokenVector[i]->mPmid]->mJournalTitle << " in pmid " << tokenVector[i]->mPmid << endl;
		}
	}

	rtn = allFeatures.Merge(biFeatures);
	CHECK_RTN(rtn);
	rtn = allFeatures.Merge(jourFeatures);
	CHECK_RTN(rtn);
	rtn = allFeatures.Normalize();
	CHECK_RTN(rtn);
	return 0;
}
Ejemplo n.º 2
0
int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, UniGramFeature& uniGrams, BiGramFeature& biGrams, feature_node** &featureSpace, int printLog)
{
	int numThreads = omp_get_num_procs();
	if (printLog != SILENT)
		clog << "CPU number: " << numThreads << endl;
	omp_set_num_threads(numThreads);
	if (printLog != SILENT)
		clog << "Extract unigram & bigram" << endl;

	if (printLog != SILENT)
		clog << "Make Feature table" << endl;
	int featureNum = (int)tokenVector.size();
	featureSpace = NULL;
	featureSpace = Malloc(feature_node*, featureNum);
	memset(featureSpace, 0, sizeof(feature_node*)* featureNum);

	int uniMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
	int biMaxIndex = biGrams.mDictionary.rbegin()->first + 1;

	if (printLog != SILENT)
		clog << "Extract features parallel" << endl;

#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)tokenVector.size(); i++)
	{
		FeatureSet tabAllFeatures;
		tabAllFeatures.mMaxIndex = uniMaxIndex;
		tabAllFeatures.mFeatures.resize(1);

		FeatureSet tabBiFeatures;
		tabBiFeatures.mMaxIndex = biMaxIndex;
		tabBiFeatures.mFeatures.resize(1);

		uniGrams.Extract(*tokenVector[i], tabAllFeatures.mFeatures[0]);
		biGrams.Extract(*tokenVector[i], tabBiFeatures.mFeatures[0]);

		tabAllFeatures.Merge(tabBiFeatures);
		tabAllFeatures.Normalize();

		featureSpace[i] = NULL;
		LinearMachine::TransFeatures(featureSpace[i], tabAllFeatures.mFeatures[0]);
	}
	return 0;
}
Ejemplo n.º 3
0
// Builds one normalized unigram feature row per token citation.
//
// tokenVector : citations to extract from; row i of the result belongs to tokenVector[i].
// uniGrams    : unigram extractor; its largest dictionary key + 1 becomes allFeatures.mMaxIndex.
// allFeatures : output; cleared, resized to tokenVector.size(), filled and normalized.
// printLog    : progress is written to clog unless this equals SILENT.
//
// Returns 0 on success and -1 when the unigram dictionary is empty. A non-zero code
// from Normalize goes through CHECK_RTN (presumably an early return of that code -
// confirm against the macro definition).
int MetalabelFeature::ExtractFeature(const std::vector<TokenCitation*>& tokenVector, UniGramFeature& uniGrams, FeatureSet& allFeatures, int printLog)
{
	int rtn = 0;
	allFeatures.mFeatures.clear();

	// rbegin() of an empty container must not be dereferenced.
	if (uniGrams.mDictionary.empty())
	{
		cerr << "Error: unigram dictionary is empty" << endl;
		return -1;
	}

	allFeatures.mMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
	allFeatures.mFeatures.resize(tokenVector.size());

	int numThreads = omp_get_num_procs();
	if (printLog != SILENT)
		clog << "CPU number: " << numThreads << endl;

	omp_set_num_threads(numThreads);
	if (printLog != SILENT)
		clog << "Start Parallel Extract Features" << endl;

	// The OpenMP loop uses a signed counter, so convert the size once instead of
	// comparing int against size_t on every iteration.
	const int tokenCount = static_cast<int>(tokenVector.size());

	// Each iteration writes only to its own row.
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < tokenCount; i++)
	{
		uniGrams.Extract(*tokenVector[i], allFeatures.mFeatures[i]);
	}

	rtn = allFeatures.Normalize();
	CHECK_RTN(rtn);

	return 0;
}
Ejemplo n.º 4
0
int MetalabelFeature::ExtractFeature(const vector<TokenCitation*>& tokenVector, CitationSet& citationSet, UniGramFeature& uniGrams, BiGramFeature& biGrams, JournalSet& journalSet, feature_node** &featureSpace, int printLog)
{
	int numThreads = omp_get_num_procs();
	if (printLog != SILENT)
		clog << "CPU number: " << numThreads << endl;
	omp_set_num_threads(numThreads);
	if (printLog != SILENT)
		clog << "Extract unigram & bigram" << endl;

	if (printLog != SILENT)
		clog << "Make Feature table" << endl;
	int featureNum = (int)tokenVector.size();
	featureSpace = NULL;
	featureSpace = Malloc(feature_node*, featureNum);
	memset(featureSpace, 0, sizeof(feature_node*)* featureNum);

	int uniMaxIndex = uniGrams.mDictionary.rbegin()->first + 1;
	int biMaxIndex = biGrams.mDictionary.rbegin()->first + 1;
	int jourMaxIndex = journalSet.mJournals.rbegin()->first + 1;

	if (printLog != SILENT)
		clog << "Extract features parallel" << endl;

#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)tokenVector.size(); i++)
	{
		FeatureSet tabAllFeatures;
		tabAllFeatures.mMaxIndex = uniMaxIndex;
		tabAllFeatures.mFeatures.resize(1);

		FeatureSet tabBiFeatures;
		tabBiFeatures.mMaxIndex = biMaxIndex;
		tabBiFeatures.mFeatures.resize(1);

		uniGrams.Extract(*tokenVector[i], tabAllFeatures.mFeatures[0]);
		biGrams.Extract(*tokenVector[i], tabBiFeatures.mFeatures[0]);

		FeatureSet tabJourFeatures;
		tabJourFeatures.mMaxIndex = jourMaxIndex;
		tabJourFeatures.mFeatures.resize(1);
		if (citationSet[tokenVector[i]->mPmid]->mJournalTitle != NULL)
		{
			Journal* ptrJournal = journalSet.SearchJournalTitle(citationSet[tokenVector[i]->mPmid]->mJournalTitle);
			if (ptrJournal != NULL)
			{
				tabJourFeatures.mFeatures[0][ptrJournal->mJournalId] = 1.0;
			}
			else
				cerr << "Error: \"" << citationSet[tokenVector[i]->mPmid]->mJournalTitle << "\" can't find journal in journal set in pmid " << tokenVector[i]->mPmid << endl;
		}
		else
			cerr << "Error: " << tokenVector[i]->mPmid << " can't find journal title in citation" << endl;

		tabAllFeatures.Merge(tabBiFeatures);
		tabAllFeatures.Merge(tabJourFeatures);
		tabAllFeatures.Normalize();

		featureSpace[i] = NULL;
		LinearMachine::TransFeatures(featureSpace[i], tabAllFeatures.mFeatures[0]);
	}
	return 0;
}
Ejemplo n.º 5
0
int SaveNeighbor()
{
	int rtn = 0;
	LhtcDocumentSet lshtcTrainSet, lshtcTestSet;
	UniGramFeature uniGrams;
	string trainsetFile = "../data/loc_train.bin";
	string testsetFile = "../data/loc_test.bin";
	vector<Feature> lshtcTrainFeatureSet, lshtcTestFeatureSet;
	vector<int> lshtcTrainFeatureID, lshtcTestFeatureID;
	Feature tempFeature;
	lshtcTrainFeatureSet.clear();
	lshtcTestFeatureSet.clear();
	lshtcTrainFeatureID.clear();
	lshtcTestFeatureID.clear();

	clog << "Load Unigram Dictionary" << endl;
	rtn = uniGrams.Load("lshtc_unigram_dictionary_loctrain.bin");
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	rtn = lshtcTrainSet.LoadBin(trainsetFile, FULL_LOG);
	CHECK_RTN(rtn);

	int trainSize = (int)lshtcTrainSet.Size();
	for (std::map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
		lshtcTrainFeatureID.push_back(it->first);

	vector<LhtcDocument*> vecTrainDocument;
	vecTrainDocument.reserve(lshtcTrainSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTrainSet.mLhtcDocuments.begin(); it != lshtcTrainSet.mLhtcDocuments.end(); ++it)
		vecTrainDocument.push_back(&(it->second));

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTrainFeatures;
	allTrainFeatures.mFeatures.resize(vecTrainDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTrainDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTrainDocument[i], allTrainFeatures.mFeatures[i]);
		if (allTrainFeatures.mFeatures[i].size() == 0) printf("%d Warning!!\n", i);
	}
	allTrainFeatures.Normalize();//get traindata feature

	rtn = lshtcTestSet.LoadBin(testsetFile, FULL_LOG);
	CHECK_RTN(rtn);

	int testSize = (int)lshtcTestSet.Size();
	for (std::map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
		lshtcTestFeatureID.push_back(it->first);

	vector<LhtcDocument*> vecTestDocument;
	vecTestDocument.reserve(lshtcTestSet.Size());
	for (map<int, LhtcDocument>::iterator it = lshtcTestSet.mLhtcDocuments.begin(); it != lshtcTestSet.mLhtcDocuments.end(); ++it)
		vecTestDocument.push_back(&(it->second));

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allTestFeatures;
	allTestFeatures.mFeatures.resize(vecTestDocument.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < (int)vecTestDocument.size(); i++)
	{
		uniGrams.ExtractLhtc(*vecTestDocument[i], allTestFeatures.mFeatures[i]);
		if (allTestFeatures.mFeatures[i].size() == 0) printf("%d Warning!!\n", i);
	}
	allTestFeatures.Normalize();//get testdata feature

	int sigSize = allTestFeatures.Size() / 5;
	for (int i = 0; i < 5; ++i)
	{
		string filename = "../data/lshtc_neighbor" + intToString(i) + ".bin";
		if (FileExist(filename))
			continue;
		clog << i << "th, sigSize = " << sigSize << endl;
		FeatureSet locFeatures;
		vector<int> locIds;
		for (int j = sigSize*i; j < sigSize*(i + 1); ++j)
		{
			locFeatures.AddInstance(allTestFeatures[j]);
			locIds.push_back(lshtcTestFeatureID[j]);
		}
		FeatureNeighbor featureneighbor;
		rtn = featureneighbor.Build(allTrainFeatures.mFeatures, locFeatures.mFeatures, lshtcTrainFeatureID, locIds);
		CHECK_RTN(rtn);

		rtn = featureneighbor.SaveBin(filename, STATUS_ONLY);
		CHECK_RTN(rtn);
		clog << "Save bin completed" << endl;
	}

	return 0;
}
Ejemplo n.º 6
0
// Scores every document with each available per-label model, using unigram features
// only, and writes the scores to a binary file.
//
// Output layout as written here: first the number of models (via Write), then one
// record per model holding the model id and a (document id, score) pair for every
// document.
//
// tokenPath     : binary tokenization result loaded into an LhtcDocumentSet.
// uniGramFile   : unigram dictionary file.
// labelFreqFile : label frequency file; labels are ordered with CmpScore before the
//                 first modelNum of them are considered.
// modelPath     : directory holding "<labelId>.model" files; labels without a file are skipped.
// modelNum      : upper bound on how many of the ordered labels are considered.
// scoreFilePath : output file, opened with mode "wb".
//
// Returns 0 on success and -1 when the id/feature counts disagree or the output file
// cannot be opened or closed cleanly. Other non-zero codes go through CHECK_RTN
// (presumably an early return of that code - confirm against the macro definition).
int SavePredictScore(string tokenPath, string uniGramFile, string labelFreqFile, string modelPath, const int modelNum, string scoreFilePath)
{
	int rtn = 0;

	clog << "Loading Tokenization Result" << endl;
	LhtcDocumentSet tokenDocuments;
	rtn = tokenDocuments.LoadBin(tokenPath.c_str(), STATUS_ONLY);//"pratest_6020.bin"
	CHECK_RTN(rtn);

	clog << "Load Unigram Dictionary" << endl;
	UniGramFeature uniGrams;
	rtn = uniGrams.Load(uniGramFile.c_str());
	CHECK_RTN(rtn);
	clog << "Total " << uniGrams.mDictionary.size() << " unigrams" << endl;

	clog << "Load Label Frequence" << endl;
	map<int, double> labelFreq;
	rtn = LoadLabelFreq(labelFreqFile.c_str(), labelFreq);
	CHECK_RTN(rtn);

	vector<pair<int, double> > meshSort(labelFreq.begin(), labelFreq.end());
	sort(meshSort.begin(), meshSort.end(), CmpScore);

	// Keep only those of the first modelNum labels that actually have a model file.
	vector<int> modelIds;
	for (size_t i = 0; i < (size_t)modelNum && i < meshSort.size(); ++i)
	{
		string modelFile = modelPath + "/" + intToString(meshSort[i].first) + ".model";
		if (FileExist(modelFile))
		{
			modelIds.push_back(meshSort[i].first);
		}
	}
	clog << modelIds.size() << " Models Available" << endl;

	// Collect ids and document pointers in one pass so index i refers to the same
	// document in both vectors.
	vector<LhtcDocument*> tokenDocVector;
	vector<int> pmids;
	tokenDocVector.reserve(tokenDocuments.Size());
	pmids.reserve(tokenDocuments.Size());
	for (map<int, LhtcDocument>::iterator it = tokenDocuments.mLhtcDocuments.begin(); it != tokenDocuments.mLhtcDocuments.end(); ++it)
	{
		pmids.push_back(it->first);
		tokenDocVector.push_back(&(it->second));
	}

	clog << "Prepare for Extract Features" << endl;
	FeatureSet allFeatures;
	allFeatures.mFeatures.resize(tokenDocVector.size());
	const int docCount = static_cast<int>(tokenDocVector.size());
#pragma omp parallel for schedule(dynamic)
	for (int i = 0; i < docCount; i++)
	{
		uniGrams.ExtractLhtc(*tokenDocVector[i], allFeatures.mFeatures[i]);
	}
	rtn = allFeatures.Normalize();
	CHECK_RTN(rtn);

	if (pmids.size() != (size_t)allFeatures.Size())
	{
		clog << "Error: pmids.size != allFeatures.size" << endl;
		return -1;
	}

	FILE * outScoreFile = fopen(scoreFilePath.c_str(), "wb");
	if (outScoreFile == NULL)
	{
		clog << "Error: can't open score file " << scoreFilePath << endl;
		return -1;
	}

	// Closes the score file on every early exit below; without it each CHECK_RTN
	// that leaves the function would leak the open handle.
	struct ScoreFileGuard
	{
		FILE* mFile;
		explicit ScoreFileGuard(FILE* file) : mFile(file) {}
		~ScoreFileGuard() { if (mFile != NULL) fclose(mFile); }
	} scoreFileGuard(outScoreFile);

	clog << "Start Predict" << endl;
	int numThreads = omp_get_num_procs();
	omp_set_num_threads(numThreads);
	rtn = Write(outScoreFile, modelIds.size());//(size_t)modelIds.size()
	CHECK_RTN(rtn);
	for (unsigned int k = 0; k < modelIds.size(); k++)
	{
		// Progress line once every 256 models.
		if ((k & 255) == 0)
		{
			clog << "LOG : Working for model " << modelIds[k] << endl;
		}
		string modelFile = modelPath + "/" + intToString(modelIds[k]) + ".model";
		LinearMachine linearMachine;
		rtn = linearMachine.Load(modelFile);
		CHECK_RTN(rtn);

		pair<int, vector<pair<int, double> > > modelScore;
		modelScore.first = modelIds[k];
		modelScore.second.resize(allFeatures.Size());
		// Each iteration writes only to its own slot of modelScore.second.
#pragma omp parallel for schedule(dynamic)
		for (int i = 0; i < allFeatures.Size(); i++)
		{
			modelScore.second[i].first = pmids[i];
			linearMachine.Predict(allFeatures[i], modelScore.second[i].second);
		}
		rtn = Write(outScoreFile, modelScore);
		CHECK_RTN(rtn);
	}

	// Close explicitly so a failed final flush is reported instead of being lost
	// in the guard's destructor.
	scoreFileGuard.mFile = NULL;
	if (fclose(outScoreFile) != 0)
	{
		clog << "Error: can't close score file " << scoreFilePath << endl;
		return -1;
	}
	outScoreFile = NULL;
	clog << "Save Complete" << endl;
	return 0;
}