Examples<EXAMPLE>::Examples(const Examples<EXAMPLE2>& e) {
    Debug::log(2) << "Examples<" << EXAMPLE::name() << ">::Examples(Examples<" << EXAMPLE2::name() << "> [" << e.size() << "])...\n";
    Debug::log(3) << stats::resource_usage() << "\n";

    this->reserve(e.size());
    this->insert(this->begin(), e.begin(), e.end());
    assert(this->size() == e.size());

    Debug::log(2) << "...Examples<" << EXAMPLE::name() << ">::Examples(Examples<" << EXAMPLE2::name() << "> [" << e.size() << "])\n";
    Debug::log(2) << stats::resource_usage() << "\n";
}
void KNN::train(Examples& exs) {
    TRACE_V(TAG, "train");

    // Maybe we didn't calculate this before...
    stats->calculateIDF();

    for (int i = 0; i < exs.getNumberOfNumericalAttibutes(); i++) {
        maxv[i] = numeric_limits<double>::min();
        minv[i] = numeric_limits<double>::max();
    }

    for (ExampleIterator e = exs.getBegin(); e != exs.getEnd(); e++) {
        vector<string> textTokens = (e)->getTextTokens();
        vector<int> textFrequencyTokens = (e)->getTextFrequency();
        string exampleClass = (e)->getClass();
        string eId = (e)->getId();

        double docSize = 0.0;
        // cout << " categorical tokens = " << tokens.size() << endl;

        // Accumulate each term's TF-IDF weight and the document's squared norm.
        for (unsigned int i = 3; i < textTokens.size(); i++) {
            int tf = textFrequencyTokens[i-3];
            string termId = textTokens[i];

            double tfidf = tf * stats->getIDF(termId);
            docSize += (tfidf * tfidf);

            docWeighted dw(eId, tfidf);
            termDocWset[termId].insert(dw);
        }

        // Track the min/max of each numerical attribute for later normalization.
        vector<double> numTokens = (e)->getNumericalTokens();
        for (unsigned int i = 0; i < numTokens.size(); i++) {
            if (greaterThan(numTokens[i], maxv[i])) {
                maxv[i] = numTokens[i];
            }
            if (lesserThan(numTokens[i], minv[i])) {
                minv[i] = numTokens[i];
            }
        }

        exNumTrain[eId] = numTokens;
        exCatTrain[eId] = (e)->getCategoricalTokens();
        docTrainSizes[eId] = docSize;
    }
}
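// A minimal standalone sketch (not part of KNN) of the quantity train()
// accumulates per document: each term contributes w(t,d) = tf(t,d) * idf(t),
// and docTrainSizes stores the squared L2 norm sum_t w(t,d)^2 that test()
// later uses for cosine normalization. The term/tf pairs and IDF table here
// are hypothetical illustration data, not part of the real class.
#include <map>
#include <string>
#include <utility>
#include <vector>

static double squaredDocNorm(const std::vector<std::pair<std::string, int> >& termTf,
                             const std::map<std::string, double>& idf) {
    double norm2 = 0.0;
    for (size_t i = 0; i < termTf.size(); i++) {
        std::map<std::string, double>::const_iterator it = idf.find(termTf[i].first);
        double w = (it == idf.end()) ? 0.0 : termTf[i].second * it->second;  // tf * idf
        norm2 += w * w;  // same accumulation as docSize in KNN::train
    }
    return norm2;
}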
/// Sanity check that the initial Weights computed from exmpls
/// match those read from the hypothesis file in this Tree.
/// \todo It would be much more efficient if we move this to
/// Ensemble::set_confidences.
template<typename EXAMPLE>
void Tree::verify_initial_weights(const Examples<EXAMPLE>& exmpls) const {
    hash_map<NodeID, Weights> initial_weights;
    for (typename Examples<EXAMPLE>::const_iterator e = exmpls.begin(); e != exmpls.end(); e++) {
        const Node& n = *this->find_leaf(*e);
        initial_weights[n.id()].add(e->initial_weight(), e->is_correct());
    }

    for (vector<Node>::const_iterator n = nodes.begin(); n != nodes.end(); n++) {
        if (!n->is_leaf()) continue;
        // assert(fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()()) < parameter::small_epsilon());
        // assert(fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()()) < parameter::small_epsilon());
        if (fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()()) > parameter::small_epsilon()) {
            ostringstream o;
            o << "|read initial posweight - calc initial posweight| = "
              << fabs(n->initial_weights().pos()() - initial_weights[n->id()].pos()());
            Debug::warning(__FILE__, __LINE__, o.str());
        }
        if (fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()()) > parameter::small_epsilon()) {
            ostringstream o;
            o << "|read initial negweight - calc initial negweight| = "
              << fabs(n->initial_weights().neg()() - initial_weights[n->id()].neg()());
            Debug::warning(__FILE__, __LINE__, o.str());
        }
    }
}
int main(int argc, char** argv) {
    string filename = argv[1], _eps = argv[2];
    double eps = stod(_eps);

    ifstream fin(filename);
    string line;
    Examples e;
    Appear app;

    // Each input line: an integer label followed by key/value feature pairs,
    // with a single separator character (read into c and discarded).
    while (getline(fin, line)) {
        istringstream sin(line);
        int ans;
        sin >> ans;
        e.PB(Example(ans));

        int key;
        char c;
        Feature val;
        while (sin >> key >> c >> val) {
            e[e.size() - 1].feat[key] = val;
            app.insert(key);
        }
    }

    exPtrs p;
    for (int i = 0; i < e.size(); i++) p.PB(&e[i]);

    Tree* root = decide(p, eps, app);

    // Emit the learned tree as a C function.
    cout << "int tree_predict(double *attr) {" << endl;
    print(root, 1);
    cout << "}" << endl;

    delete root;
    return 0;
}
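// A hypothetical usage sketch for the code this program emits: main() prints
// a C function with the fixed signature "int tree_predict(double *attr)", so a
// caller only needs a feature array indexed by the same keys that appeared in
// the training file. Both the stand-in body and the attr values below are
// made-up illustration data, not actual program output.
int tree_predict(double* attr) {
    // Shaped like the nested tests print(root, 1) would emit; the real
    // thresholds and labels depend on the learned tree.
    if (attr[0] < 1.75) return 0;
    return 1;
}

int predict_example() {
    double attr[4] = {2.5, 0.0, 0.0, 0.7};  // hypothetical feature vector
    return tree_predict(attr);              // predicted integer label
}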
void KNN::test(Examples& exs) {
    TRACE_V(TAG, "test");

    // Statistics:
    map<string, unsigned long long> classHits;
    map<string, unsigned long long> classMiss;
    map<string, unsigned long long> mappedDocs;
    map<string, unsigned long long> docsPerClass;

    int numExamples = 0;
    for (ExampleIterator it = exs.getBegin(); it != exs.getEnd(); it++) {
        numExamples++;
        if (numExamples % 100 == 0)
            cout << "Evaluated: " << numExamples << endl;

        Example ex = *it;
        vector<string> textTokens = ex.getTextTokens();
        vector<int> textFreqTokens = ex.getTextFrequency();
        vector<double> numTokens = ex.getNumericalTokens();
        vector<string> catTokens = ex.getCategoricalTokens();
        string eId = ex.getId();
        string classId = ex.getClass();

        // Squared norm of the test document under each class's credibility weighting.
        map<string, double> examplesTestSize;
        if ((usingKNNOptimize && !valuesSaved) || !usingKNNOptimize) {
            for (unsigned int i = 3; i < textTokens.size(); i++) {
                string termId = textTokens[i];
                int tf = textFreqTokens[i-3];
                for (set<string>::iterator classIt = stats->getClasses().begin(); classIt != stats->getClasses().end(); classIt++) {
                    double tfidf = tf * getContentCredibility(termId, *classIt);
                    examplesTestSize[*classIt] += (tfidf * tfidf);
                }
            }
        }

        map<string, double> similarity;
        if (usingKNNOptimize && valuesSaved) {
            similarity = saveValues[eId];
        }
        else {
            // Text similarity: negated cosine contribution of each shared term.
            for (unsigned int i = 3; i < textTokens.size(); i++) {
                string termId = textTokens[i];
                int tf = textFreqTokens[i-3];

                for (set<docWeighted, docWeightedCmp>::iterator termIt = termDocWset[termId].begin(); termIt != termDocWset[termId].end(); termIt++) {
                    string trainClass = stats->getTrainClass(termIt->docId);
                    double trainDocSize = docTrainSizes[termIt->docId];
                    double trainTermWeight = termIt->weight;
                    double testTermWeight = tf * getContentCredibility(termId, trainClass);

                    similarity[termIt->docId] += (-(trainTermWeight / sqrt(trainDocSize) * testTermWeight / sqrt(examplesTestSize[trainClass])));
                    // cout << "sim = " << similarity[termIt->docId] << endl;
                }
            }

            // Numerical KNN: squared Euclidean distance over min-max normalized attributes.
            for (map<string, vector<double> >::iterator trainIt = exNumTrain.begin(); trainIt != exNumTrain.end(); trainIt++) {
                double dist = 0.0;
                for (unsigned int i = 0; i < numTokens.size(); i++) {
                    double a = minMaxNorm(numTokens[i], i);
                    double b = minMaxNorm(exNumTrain[trainIt->first][i], i);
                    double val = (a - b) * (a - b);
                    // double val = (numTokens[i] - exNumTrain[trainIt->first][i]) * (numTokens[i] - exNumTrain[trainIt->first][i]);
                    // cout << numTokens[i] << " - " << exNumTrain[trainIt->first][i] << endl;
                    // cout << "a = " << a << " b = " << b << " val = " << val << endl;

                    if (greaterThan(dist + val, numeric_limits<double>::max())) {
                        // cerr << "OOOPS!!!" << endl;
                        // exit(0);
                        dist = numeric_limits<double>::max() - 1.0;
                        break;
                    }
                    dist += val;
                    // cout << "dist = " << dist << endl;
                }
                similarity[trainIt->first] += dist;
            }

            // Categorical KNN: credibility-weighted penalty for each mismatched attribute.
            for (map<string, vector<string> >::iterator trainIt = exCatTrain.begin(); trainIt != exCatTrain.end(); trainIt++) {
                double dist = 0.0;
                for (unsigned int i = 0; i < catTokens.size(); i++) {
                    string trainTok = exCatTrain[trainIt->first][i];
                    string testTok = catTokens[i];
                    double catCred = getCategoricalCredibility(i, testTok, stats->getTrainClass(trainIt->first));
                    // cout << "catCred = " << catCred << endl;
                    // cout << " i = " << i << " test = " << testTok << " train = " << trainTok << endl;

                    if (trainTok != testTok) {
                        // dist += 1.0 / (catCred + 1.0) + 1.0;
                        dist += 1.0 / (catCred + 1.0);
                        // cout << "dist = " << dist << endl;
                    }
                }
                similarity[trainIt->first] += dist;
            }
            // cout << "class = " << classId << " doc = " << trainIt->first << " docClass = " << stats->getTrainClass(trainIt->first) << " dist = " << dist << " 1/dist = " << 1.0/dist << " sqrt = " << sqrt(dist) << endl;
        }

        if (!valuesSaved && usingKNNOptimize) {
            saveValues[eId] = similarity;
        }

        // Similarity of each training example to this test example.
        set<docWeighted, docWeightedCmp> sim;
        for (map<string, double>::iterator testIt = similarity.begin(); testIt != similarity.end(); testIt++) {
            // Calculating graph credibility... if so.
            vector<double> graphsCreds(graphsCredibility.size());
            double similarityValue = testIt->second;
            // cout << " eid = " << eId << " eclass = " << classId << " traindocclass = " << stats->getTrainClass(testIt->first) << " similarity = " << similarityValue << endl;

            for (unsigned int g = 0; g < graphsCredibility.size(); g++) {
                double gsim = getGraphCredibility(g, eId, stats->getTrainClass(testIt->first));
                similarityValue /= (0.5 + gsim);
            }

            // Never change this, it is necessary.
            docWeighted dw(testIt->first, similarityValue);
            sim.insert(dw);
        }

        string predictedLabel = getPredictedClass(sim);
        computeConfusionMatrix(classId, predictedLabel);

        // if(io->usingPredictionsFile)
        savePrediction(eId, classId, predictedLabel);

        if (predictedLabel == classId) {
            classHits[classId]++;
        }
        else {
            classMiss[classId]++;
        }
        mappedDocs[predictedLabel]++;
        docsPerClass[classId]++;
    }

    if (valuesSaved == false) {
        valuesSaved = true;
    }

    calculateF1(classHits, classMiss, docsPerClass, mappedDocs);
    // showConfusionMatrix();
}
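// A minimal standalone sketch (not part of KNN) of the two distance pieces
// test() combines per training document: the cosine contribution of a shared
// term between L2-normalized TF-IDF vectors (added negated to similarity),
// and the min-max normalization applied to numerical attributes before their
// squared differences are summed. These helpers are hypothetical stand-ins,
// not the class methods of the same name.
#include <cmath>

// (x - min) / (max - min): the normalization assumed for minMaxNorm above.
static double minMaxNormSketch(double x, double minv, double maxv) {
    if (maxv == minv) return 0.0;  // guard against a constant attribute
    return (x - minv) / (maxv - minv);
}

// Per-term contribution whose negation is accumulated into similarity[docId].
static double cosineTermSketch(double trainWeight, double trainDocSize,
                               double testWeight, double testDocSize) {
    return (trainWeight / std::sqrt(trainDocSize)) *
           (testWeight / std::sqrt(testDocSize));
}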
/// Weight the leaves of this tree, and update the Example weights.
/// \todo Backprune splits that don't reduce loss, and backprune leaves
/// that don't have enough weight/exmpls to meet the initial splitting
/// criteria
/// \todo Weight the internal nodes too, for debugging purposes?
template<typename EXAMPLE>
void Tree::weight_leaves_and_update_examples(Examples<EXAMPLE>& exmpls) {
    Debug::log(1) << "\nTree::weight_leaves_and_update_examples(Examples<" << EXAMPLE::name() << ">)...\n";

    vector<Node>::iterator n;
    Double orig_total_weight;
    hash_map<NodeID, set<ID<Sentence> > > sentences;
    hash_map<NodeID, ExamplePtrs<EXAMPLE> > leaves;

    unsigned totcnt = 0;
    for (typename Examples<EXAMPLE>::iterator ex = exmpls.begin(); ex != exmpls.end(); ex++) {
        // Find the node that ex falls into.
        const Node* n = this->find_leaf(*ex);
        assert(n->is_leaf());
        assert(n->id() != NO_NODE);

        sentences[n->id()].insert(ex->sentence());
        sentences[NO_NODE].insert(ex->sentence());
        leaves[n->id()].push_back(&(*ex));

        // FIXME: This won't work if there's noise
        orig_total_weight += ex->weight();

        totcnt++;
        if (totcnt % 100000 == 0) Debug::log(3) << "\tProcessed " << totcnt << " examples in Tree::weight_leaves()\n";
        if (totcnt % 10000 == 0) Debug::log(4) << "\tProcessed " << totcnt << " examples in Tree::weight_leaves()\n";
    }
    Debug::log(2) << "Done processing " << totcnt << " examples in Tree::weight_leaves()\n";

    // Compute the confidence for each leaf.
    unsigned leafcnt = 0;
    unsigned sentence_cnt = 0;
    unsigned example_cnt = 0;
    for (n = nodes.begin(); n != nodes.end(); n++) {
        if (n->is_leaf()) {
            assert(leaves.find(n->id()) != leaves.end());
            const ExamplePtrs<EXAMPLE>& leaf_examples = leaves.find(n->id())->second;
            double orig_unpenalized_loss = leaf_examples.unpenalized_loss();
            Weights initial_weight = leaf_examples.initial_weight();
            example_cnt += leaf_examples.size();

            n->set_confidence_and_initial_weights(leaf_examples);

            // Add the leaf confidence to the leaf Examples.
            leaves.find(n->id())->second.add_confidence(n->confidence());

            // Update the confidence of this leaf's Example%s.
            double unpenalized_loss = leaf_examples.unpenalized_loss();

            leafcnt++;
            // Debug::log(2) << "Weighted leaf:\n";
            // Debug::log(2) << n->to_string("\t");
            Debug::log(2) << n->to_string();

            assert(sentences.find(n->id()) != sentences.end());
            sentence_cnt += sentences.find(n->id())->second.size();
            Debug::log(2) << "\t" << leaf_examples.size() << " examples from ";
            Debug::log(2) << sentences.find(n->id())->second.size() << " different sentences\n";

            double penalty = n->penalty();
            Debug::log(2) << "\tloss = " << unpenalized_loss + penalty << " = "
                          << unpenalized_loss << " (unpenalized loss) + "
                          << penalty << " (penalty)";
            if (n->confidence() != 0)
                Debug::log(2) << " (conf=0 loss was " << orig_unpenalized_loss << ")";
            Debug::log(2) << "\n";
        }
    }
    assert(example_cnt == exmpls.size());

    Debug::log(2) << "Examples from " << sentences[NO_NODE].size() << " different sentences.\n";
    Debug::log(2) << "Done weighting " << leafcnt << " leaves in Tree::weight_leaves()\n";
    Debug::log(2) << stats::resource_usage() << "\n";

    // WRITEME: Backprune splits that don't reduce loss

    Debug::log(1) << "...Tree::weight_leaves_and_update_examples(Examples<" << EXAMPLE::name() << ">)\n";
}