Пример #1
0
/// Converting constructor: fills this container with the elements of \p e,
/// an Examples container parameterised on a different example type.
/// Entry and exit banners plus resource usage are written to the debug log.
Examples<EXAMPLE>::Examples(const Examples<EXAMPLE2>& e) {
	Debug::log(2) << "Examples<" << EXAMPLE::name()
	              << ">::Examples(Examples<" << EXAMPLE2::name()
	              << "> [" << e.size() << "])...\n";
	Debug::log(3) << stats::resource_usage() << "\n";

	// Reserve room for e.size() elements, then insert the whole source range.
	this->reserve(e.size());
	this->insert(this->begin(), e.begin(), e.end());

	// Every source element must have been taken over.
	assert(this->size() == e.size());

	Debug::log(2) << "...Examples<" << EXAMPLE::name()
	              << ">::Examples(Examples<" << EXAMPLE2::name()
	              << "> [" << e.size() << "])\n";
	Debug::log(2) << stats::resource_usage() << "\n";
}
Пример #2
0
/// Builds the KNN training structures from the given examples.
///
/// For every example this records:
///  - one (example id, tf * idf) entry in termDocWset for each textual token,
///  - the sum of the squared tf * idf weights in docTrainSizes,
///  - the numerical and categorical token vectors in exNumTrain / exCatTrain,
/// and tracks the per-attribute minimum / maximum of the numerical tokens in
/// minv / maxv.
///
/// \param exs training examples to index.
void KNN::train(Examples& exs){

    TRACE_V(TAG,"train");

    // The IDF values may not have been calculated before this point.
    stats->calculateIDF();

    // Seed the running extrema. numeric_limits<double>::min() is the smallest
    // POSITIVE double, not the most negative one, so the maximum is seeded with
    // -max(); otherwise attributes whose values are all <= 0 would never
    // update maxv.
    for(int i = 0; i < exs.getNumberOfNumericalAttibutes(); i++){
        maxv[i] = -numeric_limits<double>::max();
        minv[i] = numeric_limits<double>::max();
    }

    for(ExampleIterator e = exs.getBegin(); e != exs.getEnd(); e++){

        vector<string> textTokens = e->getTextTokens();
        vector<int> textFrequencyTokens = e->getTextFrequency();
        string eId = e->getId();
        double docSize = 0.0;

        // Textual tokens are read from index 3 onward; the frequency that
        // belongs to token i is stored at i-3.
        // NOTE(review): presumably the first three entries are header fields - confirm.
        for(unsigned int i = 3; i < textTokens.size(); i++){
            int tf = textFrequencyTokens[i-3];
            string termId = textTokens[i];

            double tfidf = tf * stats->getIDF(termId);

            // Accumulate the squared weight of this document.
            docSize += (tfidf * tfidf);

            docWeighted dw(eId, tfidf);
            termDocWset[termId].insert(dw);
        }

        vector<double> numTokens = e->getNumericalTokens();

        // Widen the per-attribute [minv, maxv] range with this example.
        for(unsigned int i = 0; i < numTokens.size(); i++){
            if(greaterThan(numTokens[i], maxv[i])){
                maxv[i] = numTokens[i];
            }
            if(lesserThan(numTokens[i], minv[i])){
                minv[i] = numTokens[i];
            }
        }

        exNumTrain[eId] = numTokens;
        exCatTrain[eId] = e->getCategoricalTokens();

        docTrainSizes[eId] = docSize;
    }

}
Пример #3
0
/// Sanity check that the initial Weights computed from exmpls
/// match those read from the hypothesis file in this Tree.
/// \todo It would be much more efficient if we move this to
/// Ensemble::set_confidences.
template<typename EXAMPLE> void Tree::verify_initial_weights(const Examples<EXAMPLE>& exmpls) const {
	hash_map<NodeID, Weights> initial_weights;
	for(typename Examples<EXAMPLE>::const_iterator e = exmpls.begin(); e != exmpls.end(); e++) {
		const Node& n = *this->find_leaf(*e);
		initial_weights[n.id()].add(e->initial_weight(), e->is_correct());
	}
	for (vector<Node>::const_iterator n = nodes.begin(); n != nodes.end(); n++) {
		if (!n->is_leaf()) continue;
//		assert(fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()()) < parameter::small_epsilon());
//		assert(fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()()) < parameter::small_epsilon());
		if (fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()()) > parameter::small_epsilon()) {
			ostringstream o;
			o << "|read initial posweight - calc initial posweight| = " << fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()());
			Debug::warning(__FILE__, __LINE__, o.str());
		}
		if (fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()()) > parameter::small_epsilon()) {
			ostringstream o;
			o << "|read initial negweight - calc initial negweight| = " << fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()());
			Debug::warning(__FILE__, __LINE__, o.str());
		}
	}
}
Пример #4
0
int main(int argc, char** argv) {
	string filename = argv[1], _eps = argv[2];
	double eps = stod(_eps);
	ifstream fin(filename);
	string line;
	Examples e;
	Appear app;
	while (getline(fin, line)) {
		istringstream sin(line);
		int ans;
		sin >> ans;
		e.PB(Example(ans));
		
		int key;
		char c;
		Feature val;
		while (sin >> key >> c >> val) {
			e[e.size() - 1].feat[key] = val;
			app.insert(key);
		}
	}
	
	exPtrs p;
	for (int i = 0; i < e.size(); i++)
		p.PB(&e[i]);

	Tree *root = decide(p, eps, app);
	
	cout << "int tree_predict(double *attr) {" << endl;
	print(root, 1);
	cout << "}" << endl;
	
	delete root;
	
	return 0;
}
Пример #5
0
/// Classifies every example of \p exs against the trained model and updates
/// the evaluation statistics.
///
/// For each test example a score per training document is accumulated from
/// three parts: a text part (subtracted from the score), a numerical part (sum
/// of squared differences of values passed through minMaxNorm()) and a
/// categorical part (a penalty for every mismatching token). Each score is then
/// divided by (0.5 + graph credibility) once per configured graph, and the
/// resulting set is handed to getPredictedClass(). The prediction is recorded
/// through computeConfusionMatrix() and savePrediction(); the per-class
/// counters are passed to calculateF1() after the last example.
///
/// When usingKNNOptimize is set, the scores computed while valuesSaved is false
/// are stored in saveValues and reused (instead of recomputed) once valuesSaved
/// is true; valuesSaved is set at the end of this call.
///
/// \param exs examples to classify.
void KNN::test(Examples& exs){

    TRACE_V(TAG,"test");

    // Per-class counters handed to calculateF1() at the end.
    map<string,unsigned long long> classHits;
    map<string,unsigned long long> classMiss;
    map<string,unsigned long long> mappedDocs;
    map<string,unsigned long long> docsPerClass;

    int numExamples = 0;
    for(ExampleIterator it = exs.getBegin(); it != exs.getEnd(); it++){
        numExamples++;
        if(numExamples % 100 == 0)
            cout<<"Evaluated: " << numExamples<<endl;

        vector<string> textTokens = it->getTextTokens();
        vector<int>    textFreqTokens = it->getTextFrequency();
        vector<double> numTokens = it->getNumericalTokens();
        vector<string> catTokens = it->getCategoricalTokens();

        string eId = it->getId();
        string classId = it->getClass();

        map<string, double> similarity;

        if(usingKNNOptimize && valuesSaved){
            // Scores for this example were cached by an earlier call.
            similarity = saveValues[eId];
        }
        else{
            // Per class: sum of squared (tf * content credibility) over the
            // text tokens of this test example.
            map<string, double> examplesTestSize;

            // Fetch the class set once so that begin() and end() are guaranteed
            // to refer to the same container.
            const set<string>& classes = stats->getClasses();
            for(unsigned int i = 3; i < textTokens.size(); i++){
                string termId = textTokens[i];
                int tf = textFreqTokens[i-3];

                for(set<string>::const_iterator classIt = classes.begin(); classIt != classes.end(); ++classIt){
                    double tfidf = tf * getContentCredibility(termId, *classIt);
                    examplesTestSize[*classIt] += (tfidf * tfidf);
                }
            }

            // Text part: walk the training documents that contain each term.
            for(unsigned int i = 3; i < textTokens.size(); i++){
                string termId = textTokens[i];
                // The frequency of token i is stored at i-3 (the previous
                // constant index 1-3 read out of bounds).
                int tf = textFreqTokens[i-3];

                set<docWeighted, docWeightedCmp>& termDocs = termDocWset[termId];
                for(set<docWeighted, docWeightedCmp>::iterator termIt = termDocs.begin(); termIt != termDocs.end(); ++termIt){
                    string trainClass = stats->getTrainClass(termIt->docId);

                    double trainDocSize = docTrainSizes[termIt->docId];
                    double trainTermWeight = termIt->weight;
                    double testTermWeight = tf * getContentCredibility(termId, trainClass);

                    // NOTE(review): both square roots can be zero; the division
                    // is not guarded here - confirm the inputs exclude that case.
                    similarity[termIt->docId] += ( - ( trainTermWeight / sqrt(trainDocSize) * testTermWeight / sqrt(examplesTestSize[trainClass]) ));
                }
            }

            // Numerical part: squared distance between normalised values.
            for(map<string, vector<double> >::iterator trainIt = exNumTrain.begin(); trainIt != exNumTrain.end(); ++trainIt){
                vector<double>& trainNum = trainIt->second;
                double dist = 0.0;

                for(unsigned int i = 0; i < numTokens.size(); i++){
                    double a = minMaxNorm(numTokens[i],i);
                    double b = minMaxNorm(trainNum[i],i);
                    double val = (a-b)*(a-b);

                    // Stop accumulating once the sum would exceed the largest
                    // finite double.
                    if( greaterThan(dist + val, numeric_limits<double>::max())){
                        dist = numeric_limits<double>::max() - 1.0;
                        break;
                    }
                    dist += val;
                }
                similarity[trainIt->first] += dist;
            }

            // Categorical part: each mismatching token adds 1 / (credibility + 1).
            for(map<string, vector<string> >::iterator trainIt = exCatTrain.begin(); trainIt != exCatTrain.end(); ++trainIt){
                vector<string>& trainCat = trainIt->second;
                double dist = 0.0;

                for(unsigned int i = 0; i < catTokens.size(); i++){
                    string trainTok = trainCat[i];
                    string testTok = catTokens[i];

                    double catCred = getCategoricalCredibility(i, testTok, stats->getTrainClass(trainIt->first));
                    if(trainTok != testTok){
                        dist+= 1.0/(catCred+ 1.0);
                    }
                }
                similarity[trainIt->first] += dist;
            }
        }

        // Cache the raw scores for later calls.
        if(!valuesSaved && usingKNNOptimize){
            saveValues[eId] = similarity;
        }

        // Scale every score by the graph credibilities and collect the results.
        set<docWeighted, docWeightedCmp> sim;
        for(map<string, double>::iterator testIt = similarity.begin(); testIt != similarity.end(); ++testIt){

            double similarityValue = testIt->second;

            for(unsigned int g = 0 ; g < graphsCredibility.size(); g++){
                double gsim = getGraphCredibility(g, eId, stats->getTrainClass(testIt->first));
                similarityValue /= (0.5+gsim);
            }

            // never change this, it is necessary
            docWeighted dw(testIt->first, similarityValue);
            sim.insert(dw);
        }

        string predictedLabel = getPredictedClass(sim);

        computeConfusionMatrix(classId, predictedLabel);

        savePrediction(eId, classId, predictedLabel);

        if(predictedLabel == classId){
            classHits[classId]++;
        }
        else{
            classMiss[classId]++;
        }

        mappedDocs[predictedLabel]++;
        docsPerClass[classId]++;
    }

    // From now on the cached scores in saveValues may be reused.
    valuesSaved = true;

    calculateF1(classHits,classMiss,docsPerClass, mappedDocs);
}
Пример #6
0
/// Assign a confidence to every leaf of this tree and add that confidence to
/// the examples that fall into the leaf.
///
/// First every example of \p exmpls is routed to its leaf; then, per leaf, the
/// confidence and initial weights are set from the leaf's examples, the
/// confidence is added to those examples, and the loss before/after is logged.
/// \todo Backprune splits that don't reduce loss, and backprune leaves
/// that don't have enough weight/exmpls to meet the initial splitting
/// criteria
/// \todo Weight the internal nodes too, for debugging purposes?
template<typename EXAMPLE> void Tree::weight_leaves_and_update_examples(Examples<EXAMPLE>& exmpls) {
	Debug::log(1) << "\nTree::weight_leaves_and_update_examples(Examples<" << EXAMPLE::name() << ">)...\n";

	Double orig_total_weight;

	// Sentences seen per leaf; the NO_NODE key collects the sentences of all examples.
	hash_map<NodeID, set<ID<Sentence> > > sentences;
	// Pointers to the examples that landed in each leaf.
	hash_map<NodeID, ExamplePtrs<EXAMPLE> > leaves;

	unsigned totcnt = 0;
	for(typename Examples<EXAMPLE>::iterator ex = exmpls.begin(); ex != exmpls.end(); ex++) {
		// Find the leaf this example falls into.
		const Node* leaf = this->find_leaf(*ex);
		assert(leaf->is_leaf());
		assert(leaf->id() != NO_NODE);

		sentences[leaf->id()].insert(ex->sentence());
		sentences[NO_NODE].insert(ex->sentence());

		leaves[leaf->id()].push_back(&(*ex));

		// FIXME: This won't work if there's noise
		orig_total_weight += ex->weight();

		// Progress output: coarse at level 3, finer at level 4.
		totcnt++;
		if (totcnt % 100000 == 0)
			Debug::log(3) << "\tProcessed " << totcnt << " examples in Tree::weight_leaves()\n";
		if (totcnt % 10000 == 0)
			Debug::log(4) << "\tProcessed " << totcnt << " examples in Tree::weight_leaves()\n";
	}

	Debug::log(2) << "Done processing " << totcnt << " examples in Tree::weight_leaves()\n";

	// Compute the confidence for each leaf.
	unsigned leafcnt = 0;
	unsigned sentence_cnt = 0;
	unsigned example_cnt = 0;
	for (vector<Node>::iterator node = nodes.begin(); node != nodes.end(); node++) {
		if (!node->is_leaf()) continue;

		// Every leaf is expected to have received at least one example.
		typename hash_map<NodeID, ExamplePtrs<EXAMPLE> >::iterator bucket = leaves.find(node->id());
		assert(bucket != leaves.end());

		const ExamplePtrs<EXAMPLE>& leaf_examples = bucket->second;
		// Loss before the leaf confidence is added to the examples.
		double orig_unpenalized_loss = leaf_examples.unpenalized_loss();
		Weights initial_weight = leaf_examples.initial_weight();

		example_cnt += leaf_examples.size();
		node->set_confidence_and_initial_weights(leaf_examples);

		// Add the leaf confidence to the leaf Examples.
		bucket->second.add_confidence(node->confidence());

		// Loss after the examples of this leaf have been updated.
		double unpenalized_loss = leaf_examples.unpenalized_loss();

		leafcnt++;
		Debug::log(2) << node->to_string();

		hash_map<NodeID, set<ID<Sentence> > >::iterator leaf_sentences = sentences.find(node->id());
		assert(leaf_sentences != sentences.end());
		sentence_cnt += leaf_sentences->second.size();
		Debug::log(2) << "\t" << leaf_examples.size() << " examples from ";
		Debug::log(2) << leaf_sentences->second.size() << " different sentences\n";

		double penalty = node->penalty();
		Debug::log(2) << "\tloss = " << unpenalized_loss + penalty << " = " << unpenalized_loss << " (unpenalized loss) + " << penalty << " (penalty)";
		if (node->confidence() != 0) Debug::log(2) << "  (conf=0 loss was " << orig_unpenalized_loss << ")";
		Debug::log(2) << "\n";
	}

	// Each example must have been counted in exactly one leaf.
	assert(example_cnt == exmpls.size());
	Debug::log(2) << "Examples from " << sentences[NO_NODE].size() << " different sentences.\n";

	Debug::log(2) << "Done weighting " << leafcnt << " leaves in Tree::weight_leaves()\n";
	Debug::log(2) << stats::resource_usage() << "\n";

	// WRITEME: Backprune splits that don't reduce loss

	Debug::log(1) << "...Tree::weight_leaves_and_update_examples(Examples<" << EXAMPLE::name() << ">)\n";
}