/**
 * Builds an Instances collection from a text file, one candidate instance per line.
 *
 * Every line is split on runs of whitespace. Each token is lower-cased and kept
 * only when it is absent from stopword_list and is strictly longer than
 * min_size characters. Kept tokens are stored under the "token" key; a line
 * that yields no tokens contributes no instance.
 *
 * @param file_name      path of the text file to read
 * @param stopword_list  tokens to drop; entries are matched verbatim against the
 *                       lower-cased token
 * @param min_size       a token must have more than this many characters to be kept
 * @return the instances, added in the order the lines are read
 */
graphmod::Instances from_lines(string file_name, vector<string> stopword_list, unsigned int min_size){
  Instances instances;
  const set<string> stopwords(stopword_list.begin(), stopword_list.end());

  namespace io = boost::iostreams;
  ifstream file(file_name.c_str(), ios_base::in);
  io::filtering_stream<io::input> in;
  in.push(file);

  boost::regex expr("\\s+", boost::regex::perl);
  string line;
  // Loop on the result of getline itself. Testing the stream before reading
  // lets a failed read fall through with the previous contents of `line`,
  // which duplicated the last line of files lacking a trailing newline.
  while(getline(in, line)){
    map<string, vector<string> > instance;
    vector<string>& tokens = instance["token"];

    boost::sregex_token_iterator t1(line.begin(), line.end(), expr, -1);
    boost::sregex_token_iterator t2;
    for(; t1 != t2; ++t1){
      string value = *t1;
      for(auto& c: value){
        // tolower is only defined for values representable as unsigned char.
        c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
      }
      if(stopwords.count(value) == 0 and value.size() > min_size){
        tokens.push_back(value);
      }
    }

    if(not tokens.empty()){
      instances.add(instance);
    }
  }
  return instances;
}
/**
 * Builds one instance per verb occurrence found in a set of CoNLL files.
 *
 * A word is treated as a verb when its fine-grained tag starts with 'V'. Each
 * instance carries the keys "verb" (lemma), "verb_tag" (fine tag) and three
 * parallel lists, "tag", "gr" and "lemma", describing the context words:
 *   - window > 0: context is cs.get_near(cw, window); "gr" holds each word's
 *     index minus the verb's index, as a decimal string.
 *   - otherwise:  context is cs.get_related(cw); "gr" holds each word's
 *     relation label.
 *
 * @param file_names      CoNLL files handed to ConllLoader
 * @param _keep_verbs     lemmas to keep; when empty, every verb is kept
 * @param window          selects the context mode described above
 * @param limit           0 for no limit; otherwise reading stops once at least
 *                        this many instances exist. The check runs between
 *                        sentences, so the final sentence may overshoot it.
 * @param per_verb_limit  0 for no limit; otherwise the maximum number of
 *                        instances produced for any single lemma
 * @return the collected instances
 */
graphmod::Instances from_conll(vector<string> file_names, vector<string> _keep_verbs, int window, unsigned int limit=0, unsigned int per_verb_limit=0){
  Instances instances;
  const set<string> keep_verbs(_keep_verbs.begin(), _keep_verbs.end());
  std::map<std::string, unsigned int> verb_counts;
  ConllLoader sentences(file_names);

  while(not sentences.eof() and (limit == 0 or instances.size() < limit)){
    auto cs = sentences.next();
    for(ConllWord cw: cs){
      if(cw.get_fine_tag()[0] != 'V'){
        continue;
      }
      const string lemma = cw.get_lemma();
      if(not keep_verbs.empty() and keep_verbs.count(lemma) == 0){
        continue;
      }

      // Compare with >= before counting this occurrence: the previous
      // "> per_verb_limit" test admitted per_verb_limit + 1 instances per lemma.
      unsigned int& seen = verb_counts[lemma];
      if(per_verb_limit > 0 and seen >= per_verb_limit){
        continue;
      }
      ++seen;

      map<string, vector<string> > instance;
      instance["verb"] = {lemma};
      instance["verb_tag"] = {cw.get_fine_tag()};
      // Touch the three context keys so they exist even when there is no context.
      vector<string>& tags = instance["tag"];
      vector<string>& relations = instance["gr"];
      vector<string>& lemmas = instance["lemma"];

      if(window > 0){
        const int verb_index = cw.get_index();
        for(ConllWord ow: cs.get_near(cw, window)){
          tags.push_back(ow.get_fine_tag());
          lemmas.push_back(ow.get_lemma());
          stringstream ss;
          ss << ow.get_index() - verb_index;
          relations.push_back(ss.str());
        }
      }
      else{
        // An earlier revision also formatted a composite "gr(tag[-lemma],verb_tag)"
        // label here, but it was never stored; only the bare relation is recorded.
        for(ConllWord ow: cs.get_related(cw)){
          tags.push_back(ow.get_fine_tag());
          relations.push_back(ow.get_relation());
          lemmas.push_back(ow.get_lemma());
        }
      }
      instances.add(instance);
    }
  }
  return instances;
}
/**
 * Computes the threshold-curve data points for one class.
 *
 * The predictions are visited in the order given by Utils::Sort on their
 * probabilities for classIndex (presumably ascending). Each time the
 * probability rises above the current threshold, the weights accumulated so
 * far are moved from the positive side to the negative side of the running
 * TwoClassStats and one curve point is emitted. Predictions with a missing
 * actual class or a negative weight are reported on stdout and ignored.
 *
 * @param predictions  predictions to evaluate; each element is cast to
 *                     NominalPrediction*
 * @param classIndex   index of the class treated as positive
 * @return a new Instances built on makeHeader(), or nullptr when predictions
 *         is empty or classIndex is not a valid index into the first
 *         prediction's distribution
 */
Instances *ThresholdCurve::getCurve(std::vector<Prediction*> predictions, const int classIndex) {
    // A negative classIndex was previously rejected only through
    // signed/unsigned wraparound in the size comparison; reject it explicitly.
    if (predictions.empty() || classIndex < 0) {
        return nullptr;
    }
    NominalPrediction *first = static_cast<NominalPrediction*>(predictions.at(0));
    if (static_cast<int>(first->distribution().size()) <= classIndex) {
        return nullptr;
    }

    // Reports and rejects predictions that cannot contribute to the curve.
    const auto isUsable = [](NominalPrediction *pred) -> bool {
        if (pred->actual() == Prediction::MISSING_VALUE) {
            std::cout << " Skipping prediction with missing class value";
            return false;
        }
        if (pred->weight() < 0) {
            std::cout << " Skipping prediction with negative weight";
            return false;
        }
        return true;
    };

    double totPos = 0, totNeg = 0;
    double_array probs = getProbabilities(predictions, classIndex);

    // Get distribution of positive/negatives
    const int numProbs = static_cast<int>(probs.size());
    for (int i = 0; i < numProbs; i++) {
        NominalPrediction *pred = static_cast<NominalPrediction*>(predictions.at(i));
        if (!isUsable(pred)) {
            continue;
        }
        if (pred->actual() == classIndex) {
            totPos += pred->weight();
        } else {
            totNeg += pred->weight();
        }
    }

    Instances *insts = makeHeader();
    int_array sorted = Utils::Sort(probs);
    const int numSorted = static_cast<int>(sorted.size());

    // Automatic storage: the stats objects used to be heap-allocated and were
    // never freed. makeInstance is handed the same object repeatedly while it
    // is being mutated, so it is expected to read the values at call time.
    // Arguments are presumably (TP, FP, TN, FN) - TODO confirm against TwoClassStats.
    TwoClassStats stats(totPos, totNeg, 0, 0);
    double threshold = 0;
    double cumulativePos = 0;
    double cumulativeNeg = 0;

    for (int i = 0; i < numSorted; i++) {
        if ((i == 0) || (probs[sorted[i]] > threshold)) {
            stats.setTruePositive(stats.getTruePositive() - cumulativePos);
            stats.setFalseNegative(stats.getFalseNegative() + cumulativePos);
            stats.setFalsePositive(stats.getFalsePositive() - cumulativeNeg);
            stats.setTrueNegative(stats.getTrueNegative() + cumulativeNeg);
            threshold = probs[sorted[i]];
            // NOTE(review): makeInstance returns a pointer that is dereferenced
            // and never released here; confirm whether Instances::add copies
            // the instance (then this leaks) or keeps referring to it.
            insts->add(*makeInstance(&stats, threshold));
            cumulativePos = 0;
            cumulativeNeg = 0;
            if (i == numSorted - 1) {
                break;
            }
        }

        NominalPrediction *pred = static_cast<NominalPrediction*>(predictions.at(sorted[i]));
        if (!isUsable(pred)) {
            continue;
        }
        if (pred->actual() == classIndex) {
            cumulativePos += pred->weight();
        } else {
            cumulativeNeg += pred->weight();
        }
    }

    // make sure a zero point gets into the curve
    if (stats.getFalseNegative() != totPos || stats.getTrueNegative() != totNeg) {
        TwoClassStats zeroPoint(0, 0, totNeg, totPos);
        threshold = probs[sorted[numSorted - 1]] + 10e-6;
        insts->add(*makeInstance(&zeroPoint, threshold));
    }
    return insts;
}