Esempio n. 1
0
  /**
   * Builds one instance per non-empty line of a whitespace-tokenized text file.
   *
   * Every line is split on runs of whitespace and each token is lower-cased.
   * A token is kept (under the "token" key of the instance) when it is not in
   * the stop-word list and is strictly longer than min_size characters.  Lines
   * that yield no surviving token produce no instance.
   *
   * @param file_name      path of the text file to read
   * @param stopword_list  tokens to discard; compared against the lower-cased token
   * @param min_size       a token must have MORE than this many characters to be kept
   * @return the instances collected from the file
   *
   * NOTE(review): the result of opening the file is not checked; an unreadable
   * file presumably just yields an empty result -- confirm this is intended.
   */
  graphmod::Instances from_lines(string file_name, vector<string> stopword_list, unsigned int min_size){
    Instances instances;
    const set<string> stopwords(stopword_list.begin(), stopword_list.end());
    namespace io = boost::iostreams;
    ifstream file(file_name.c_str(), ios_base::in);
    io::filtering_stream<io::input> in;
    in.push(file);
    const boost::regex whitespace("\\s+", boost::regex::perl);
    const boost::sregex_token_iterator no_more_tokens;
    string line;
    // Test the stream AFTER each read.  Checking it before the read would run
    // the body once more after the final failed getline; when the file has no
    // trailing newline that failed read leaves `line` untouched, so the last
    // line would be tokenized and added a second time.
    while(getline(in, line)){
      vector<string> tokens;
      // Submatch index -1 selects the text BETWEEN whitespace matches.
      for(boost::sregex_token_iterator token(line.begin(), line.end(), whitespace, -1); token != no_more_tokens; ++token){
        string value = *token;
        for(char& c: value){
          // tolower requires a value representable as unsigned char; passing a
          // negative plain char (non-ASCII byte) is undefined behaviour.
          c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
        }
        if(stopwords.count(value) == 0 and value.size() > min_size){
          tokens.push_back(value);
        }
      }
      if(not tokens.empty()){
        map<string, vector<string> > instance;
        instance["token"] = tokens;
        instances.add(instance);
      }
    }
    return instances;
  }
Esempio n. 2
0
  /**
   * Extracts one instance per verb occurrence from CoNLL-formatted files.
   *
   * A word counts as a verb when its fine-grained tag starts with 'V'.  For
   * every accepted verb the instance holds:
   *   - "verb":     the verb's lemma
   *   - "verb_tag": the verb's fine-grained tag
   *   - "tag", "lemma", "gr": parallel lists describing the context words
   *     (always present, possibly empty).
   *
   * With window > 0 the context is whatever the sentence's get_near(verb, window)
   * returns and "gr" holds each word's index offset from the verb, as text.
   * Otherwise the context is get_related(verb) and "gr" holds each word's
   * relation label.
   *
   * @param file_names      CoNLL files handed to ConllLoader
   * @param _keep_verbs     lemmas to accept; an empty list accepts every verb
   * @param window          context window size; <= 0 selects the relation-based context
   * @param limit           stop reading once this many instances exist (0 = no limit).
   *                        The check runs once per sentence, so the result can
   *                        exceed limit by the verbs of the last sentence read.
   * @param per_verb_limit  maximum number of instances per lemma (0 = no limit)
   * @return the collected instances
   */
  graphmod::Instances from_conll(vector<string> file_names, vector<string> _keep_verbs, int window, unsigned int limit=0, unsigned int per_verb_limit=0){
    Instances instances;
    const set<string> keep_verbs(_keep_verbs.begin(), _keep_verbs.end());
    std::map<std::string, unsigned int> verb_counts;
    ConllLoader sentences(file_names);
    while(not sentences.eof() and (limit == 0 or instances.size() < limit)){
      auto cs = sentences.next();
      for(ConllWord cw: cs){
        const string verb_tag = cw.get_fine_tag();
        if(verb_tag[0] != 'V'){
          continue;
        }
        const string lemma = cw.get_lemma();
        if(not keep_verbs.empty() and keep_verbs.count(lemma) == 0){
          continue;
        }
        // Cap at exactly per_verb_limit instances per lemma.  (Comparing with
        // '>' before incrementing let one extra instance through.)
        unsigned int& seen = verb_counts[lemma];
        if(per_verb_limit > 0 and seen >= per_verb_limit){
          continue;
        }
        ++seen;

        map<string, vector<string> > instance;
        instance["verb"] = {lemma};
        instance["verb_tag"] = {verb_tag};
        // operator[] creates the three context lists up front so the keys
        // exist even when the verb has no context words.
        vector<string>& tags = instance["tag"];
        vector<string>& relations = instance["gr"];
        vector<string>& lemmas = instance["lemma"];

        if(window > 0){
          const int verb_index = cw.get_index();
          for(ConllWord ow: cs.get_near(cw, window)){
            tags.push_back(ow.get_fine_tag());
            lemmas.push_back(ow.get_lemma());
            // Position relative to the verb, stored as text.
            stringstream offset;
            offset << ow.get_index() - verb_index;
            relations.push_back(offset.str());
          }
        }
        else{
          // Only the bare relation label is recorded; a composite
          // "relation(tag[-lemma],verb_tag)" label used to be assembled here
          // but was never stored, so that dead work has been dropped.
          for(ConllWord ow: cs.get_related(cw)){
            tags.push_back(ow.get_fine_tag());
            relations.push_back(ow.get_relation());
            lemmas.push_back(ow.get_lemma());
          }
        }
        instances.add(instance);
      }
    }
    return instances;
  }
Esempio n. 3
0
/**
 * Builds the threshold-curve data set for one class from a list of predictions.
 *
 * The predictions are visited in the order given by Utils::Sort on their
 * class probabilities (presumably ascending).  Each time a new, larger
 * probability is reached, the weight accumulated since the previous point is
 * moved from true-positive to false-negative and from false-positive to
 * true-negative, and one curve point is emitted for that threshold.  If the
 * last emitted point does not already have FN == totPos and TN == totNeg, a
 * final point with a threshold just above the largest probability is added.
 *
 * Predictions whose actual value equals Prediction::MISSING_VALUE or whose
 * weight is negative are reported on std::cout and contribute no weight.
 *
 * @param predictions  predictions to evaluate; each element is treated as a
 *                     NominalPrediction (unchecked static_cast)
 * @param classIndex   index of the class regarded as "positive"
 * @return the data set created by makeHeader() filled with the curve points,
 *         or nullptr when predictions is empty or classIndex is not a valid
 *         index into the first prediction's distribution
 *
 * NOTE(review): the Instance returned by makeInstance() is never deleted
 * here; whether Instances::add copies or adopts it is not visible -- confirm.
 */
Instances *ThresholdCurve::getCurve(std::vector<Prediction*> predictions, const int classIndex) {

    // A negative index was previously rejected only through the implicit
    // int -> unsigned conversion in the size comparison; state it explicitly.
    if (predictions.empty() || classIndex < 0) {
        return nullptr;
    }
    const NominalPrediction *first = static_cast<NominalPrediction*>(predictions.at(0));
    if (static_cast<int>(first->distribution().size()) <= classIndex) {
        return nullptr;
    }

    // Adds one prediction's weight to posSum (actual == classIndex) or negSum
    // (any other actual); unusable predictions are reported and ignored.
    const auto accumulate = [classIndex](Prediction *raw, double &posSum, double &negSum) {
        NominalPrediction *pred = static_cast<NominalPrediction*>(raw);
        if (pred->actual() == Prediction::MISSING_VALUE) {
            std::cout << " Skipping prediction with missing class value";
            return;
        }
        if (pred->weight() < 0) {
            std::cout << " Skipping prediction with negative weight";
            return;
        }
        if (pred->actual() == classIndex) {
            posSum += pred->weight();
        }
        else {
            negSum += pred->weight();
        }
    };

    double_array probs = getProbabilities(predictions, classIndex);

    // Total positive / negative weight over all usable predictions.
    double totPos = 0, totNeg = 0;
    const int probCount = static_cast<int>(probs.size());
    for (int i = 0; i < probCount; i++) {
        accumulate(predictions.at(i), totPos, totNeg);
    }

    Instances *insts = makeHeader();
    int_array sorted = Utils::Sort(probs);
    const int sortedCount = static_cast<int>(sorted.size());

    // Automatic storage instead of `new`: the stats object was never deleted.
    // makeInstance() is assumed to copy the values it needs -- the same object
    // is mutated between calls, so retaining the pointer would alias every
    // curve point.  Argument order is presumably (TP, FP, TN, FN).
    TwoClassStats stats(totPos, totNeg, 0, 0);
    double threshold = 0;
    double cumulativePos = 0;
    double cumulativeNeg = 0;
    for (int i = 0; i < sortedCount; i++) {
        const double prob = probs[sorted[i]];

        if (i == 0 || prob > threshold) {
            // Everything accumulated below the new threshold is now predicted negative.
            stats.setTruePositive(stats.getTruePositive() - cumulativePos);
            stats.setFalseNegative(stats.getFalseNegative() + cumulativePos);
            stats.setFalsePositive(stats.getFalsePositive() - cumulativeNeg);
            stats.setTrueNegative(stats.getTrueNegative() + cumulativeNeg);
            threshold = prob;
            insts->add(*makeInstance(&stats, threshold));
            cumulativePos = 0;
            cumulativeNeg = 0;
            if (i == sortedCount - 1) {
                break;
            }
        }

        accumulate(predictions.at(sorted[i]), cumulativePos, cumulativeNeg);
    }

    // make sure a zero point gets into the curve
    if (stats.getFalseNegative() != totPos || stats.getTrueNegative() != totNeg) {
        TwoClassStats zeroPoint(0, 0, totNeg, totPos);
        threshold = probs[sorted[sortedCount - 1]] + 10e-6;
        insts->add(*makeInstance(&zeroPoint, threshold));
    }

    return insts;
}