示例#1
0
/**
 * Trains `model` on the tagged sentences stored in `filename`.
 *
 * Every token of every line read becomes one training sample. Training uses
 * an L1 regularizer (1.0) with 100 held-out samples, and the fitted model is
 * written to the file "model".
 *
 * Terminates the process with status 1 when the file cannot be opened.
 */
void train(ME_Model & model, const string & filename)
{
  ifstream input(filename.c_str());
  if (!input) {
    cerr << "error: cannot open " << filename << endl; 
    exit(1); 
  }

  int lines_seen = 0;
  for (string text; getline(input, text); ) {
    vector<Token> tokens = read_line(text);
    const int token_count = (int)tokens.size();
    for (int pos = 0; pos < token_count; pos++) {
      ME_Sample one = sample(tokens, pos);
      model.add_training_sample(one);
    }

    // Cap on the amount of training data: the check uses the counter's value
    // from before the increment, so reading stops after the line for which
    // that value first exceeds 10000.
    const bool cap_reached = lines_seen > 10000;
    lines_seen++;
    if (cap_reached) break;
  }

  model.use_l1_regularizer(1.0);
  model.set_heldout(100);
  model.train();
  model.save_to_file("model");
}
示例#2
0
/**
 * Trains the bidirectional tagging models and saves each to "model.bidir.<t>".
 *
 * The model type `t` is a 4-bit mask selecting which neighbouring POS tags
 * are passed to mesample() as context:
 *   0x8 -> second tag to the left,  0x4 -> first tag to the left,
 *   0x2 -> first tag to the right,  0x1 -> second tag to the right.
 * A context whose bit is clear is passed as the empty string. Only the types
 * 0 (no context) and 15 (all four contexts) are trained.
 *
 * @param vs    training sentences.
 * @param para  -1 trains every enabled type; otherwise only types with
 *              `t % 4 == para` are trained (so 0 selects type 0 and 3
 *              selects type 15).
 * @return always 0. The function is declared `int`, so it must return a
 *         value; flowing off the end would be undefined behaviour.
 */
int
bidir_train(const vector<Sentence> & vs, int para)
{
  for (int t = 0; t < 16; t++) {
    if (t != 15 && t != 0) continue;
    if (para != -1 && t % 4 != para) continue;

    cerr << "type = " << t << endl;
    cerr << "extracting features...";

    vector<ME_Sample> train;
    for (vector<Sentence>::const_iterator i = vs.begin(); i != vs.end(); ++i) {
      const Sentence & s = *i;
      const int len = int(s.size());
      for (int j = 0; j < len; j++) {

        // Sentinel tags stand in for neighbours beyond the sentence edges.
        string pos_left1 = "BOS", pos_left2 = "BOS2";
        if (j >= 1) pos_left1 = s[j-1].pos;
        if (j >= 2) pos_left2 = s[j-2].pos;
        string pos_right1 = "EOS", pos_right2 = "EOS2";
        if (j <= len - 2) pos_right1 = s[j+1].pos;
        if (j <= len - 3) pos_right2 = s[j+2].pos;

        // Blank out every context that this model type does not use.
        if ( (t & 0x8) == 0 ) pos_left2 = "";
        if ( (t & 0x4) == 0 ) pos_left1 = "";
        if ( (t & 0x2) == 0 ) pos_right1 = "";
        if ( (t & 0x1) == 0 ) pos_right2 = "";

        train.push_back(mesample(s, j, pos_left2, pos_left1, pos_right1, pos_right2));
      }
    }
    cerr << "done" << endl;

    ME_Model m;
    m.train(train, 2, 0, 1);

    char buf[1000];
    snprintf(buf, sizeof(buf), "model.bidir.%d", t);
    m.save_to_file(buf);
  }

  return 0;
}
示例#3
0
// Print weights
// Writes one line per feature of the global model through Rprintf, formatted
// as "<weight>  <first name> <second name>". The two names are the members of
// the string pair returned by get_features() (presumably class label and
// feature name -- confirm against ME_Model).
void print_weights() {
	typedef pair< pair<string, string>, double > WeightedFeature;

	list<WeightedFeature> features;
	model.get_features(features);

	list<WeightedFeature>::const_iterator it = features.begin();
	while (it != features.end()) {
		const pair<string, string> & names = it->first;
		const double weight = it->second;
		Rprintf("%10.3f  %-10s %s\n", weight, names.first.c_str(), names.second.c_str());
		++it;
	}
}
示例#4
0
// Export weights
// Returns the features of the global model as three parallel string columns:
//   [0] the weight, formatted with precision 3 and right-aligned to width 10;
//   [1] the first name of the feature pair, left-aligned to width 10;
//   [2] the second name of the feature pair, unpadded.
// Row r of every column describes the same feature.
vector< vector<string> > export_weights() {
	typedef pair< pair<string, string>, double > WeightedFeature;

	list<WeightedFeature> features;
	model.get_features(features);

	vector< vector<string> > columns(3);
	for (list<WeightedFeature>::const_iterator it = features.begin(); it != features.end(); ++it) {
		stringstream weight_text;
		weight_text << setprecision(3) << setw(10) << it->second;
		columns[0].push_back(weight_text.str());

		stringstream first_text;
		first_text << left << setw(10) << it->first.first.c_str();
		columns[1].push_back(first_text.str());

		// Going through c_str() keeps the original's C-string semantics
		// (the text ends at the first NUL character).
		columns[2].push_back(string(it->first.second.c_str()));
	}

	return columns;
}
示例#5
0
/**
 * Builds one training sample from a raw feature vector and adds it to `model`.
 *
 * The sample's label comes from label_sample(feat); feature i is registered
 * under the name "feat<i>" with value feat[i].
 *
 * @param model  model that receives the training sample.
 * @param feat   array holding at least `nfeat` values.
 * @param nfeat  number of features to read from `feat`.
 */
void add_feat_to_model(ME_Model & model, double *feat, int nfeat = 7)
{
    // NOTE(review): ownership of the string returned by label_sample() is not
    // visible here, so it is deliberately not freed -- confirm with its author.
    char *label = label_sample(feat); 
    ME_Sample samp(label);
    for (int i = 0; i < nfeat; i++) {
        // A stack buffer replaces the per-feature malloc(10), which was never
        // freed and was too small for indices of six or more digits.
        // Assumes add_feature() copies the name (it takes a std::string in
        // Tsuruoka's maxent.h) -- confirm if a different ME_Sample is used.
        char mark[32];
        snprintf(mark, sizeof(mark), "feat%d", i);
        samp.add_feature(mark, feat[i]);
    }
    model.add_training_sample(samp);
}
示例#6
0
文件: train.cpp 项目: 153370771/ltp
/**
 * Trains `model` from the samples in `input` and saves it to `model_path`.
 *
 * Each line of the input file is split into fields by split() and turned
 * into one ME_Sample (constructed with the flag `true`).
 *
 * Terminates the process with status 1 when `input` cannot be opened.
 */
void train(ME_Model & model, const string & input, const string & model_path)
{
    ifstream reader(input.c_str());
    if (!reader)
    {
        cerr << "error: cannot open " << input << endl; 
        exit(1);
    }

    for (string record; getline(reader, record); )
    {
        vector<string> fields = split(record);
        ME_Sample one(fields, true);
        model.add_training_sample(one);
    }

    model.train();
    model.save(model_path);
}
示例#7
0
void add_samples(int nrows, int ncols, vector<string> samples, vector<double> ia, vector<string> ja, vector<double> ra) {
	new_model();
	for (int i=0; i < nrows; i++) { // for each document
		//Rprintf("Document %d\n",i); // debug output
		ME_Sample newSample(samples[i]); // create new sample for code
		for (int j=ia[i]-1; j < ia[i+1]-1; j++) { // for each feature
			//Rprintf("Feature %s\n",features[j].c_str());
			newSample.add_feature(ja[j],ra[j]);
		}
		model.add_training_sample(newSample);
	}
}
示例#8
0
// Classifies `nrows` documents with a model restored from `model_data`.
//
// The global model is reset and loaded from the serialized string. Documents
// use the same sparse layout as the training input: the entries of `ja`
// (feature names) and `ra` (values) at 0-based positions ia[i]-1 up to, but
// not including, ia[i+1]-1 belong to document i. `ncols` is not used.
//
// Returns an R list with three elements: the predicted label of every
// document, an nrows x num_classes matrix of the values returned by
// classify(), and the class labels naming the matrix columns.
RcppExport SEXP classify_samples(int nrows, int ncols, vector<double> ia, vector<string> ja, vector<double> ra, string model_data) {
	new_model();
	model.load_from_string(model_data);

	NumericMatrix probability_matrix(nrows,model.num_classes());
	vector<string> results;

	for (int doc = 0; doc < nrows; ++doc) {
		ME_Sample unlabeled;

		// The bound stays a double so the comparison matches the original.
		const double stop = ia[doc + 1] - 1;
		for (int k = ia[doc] - 1; k < stop; ++k) {
			unlabeled.add_feature(ja[k], ra[k]);
		}

		// classify() also fills in unlabeled.label, which is read below.
		vector<double> prob = model.classify(unlabeled);
		int cls = 0;
		while (cls < model.num_classes()) {
			probability_matrix(doc, cls) = prob[cls];
			++cls;
		}

		results.push_back(unlabeled.label);
	}

	vector<string> probability_names;
	for (int cls = 0; cls < model.num_classes(); ++cls) {
		probability_names.push_back(model.get_class_label(cls));
	}

	List rs = List::create(results,probability_matrix,probability_names);
	return rs;
}
示例#9
0
int main(int argc, char** argv)
{
    /*
     * Params: model_file_name, input_file_name, output_file_name
     *
     */
    if (argc < 4)
    {
        exit_with_help();
    }

    string model_path  = argv[1];
    string input_path  = argv[2];
    string output_path = argv[3];

    ME_Model m;
    m.load(model_path);

    validate(m, input_path, output_path);

    return 0;
}
示例#10
0
/**
 * Entry point: trains a model from a tab-separated feature file.
 *
 * argv[1] is the training file. Each line is parsed by split() into `ncols`
 * doubles and fed to add_feat_to_model(). Reading stops once `count`
 * exceeds 2000000. The trained model is saved to the file "model".
 *
 * @return 0 on success, 1 on a missing argument or an unreadable file.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        fprintf( stderr, "Usage: %s train_file\n", argc > 0 ? argv[0] : "train" );
        return 1;
    }

    ME_Model model;
    const int ncols = 7;
    const char *train = argv[1];

    // The original referred to an undeclared `filename`; the training file
    // path in argv[1] is what is meant.
    FILE *fp = fopen( train, "r" );
    if ( fp == NULL ) {
        fprintf( stderr, "Error opening file %s\n", train );
        return 1;
    }

    // Lines longer than the buffer are delivered by fgets() in several pieces.
    char buf[300];
    int count = 0;
    while ( fgets( buf, sizeof(buf), fp ) != NULL ) {
        double *spl = split(buf, '\t', ncols);
        // NOTE(review): the loop index is unused, so every line is added to
        // the model `ncols` times. Kept as in the original because removing
        // it changes the training data -- confirm whether it is intended.
        for (int i = 0; i < ncols; i++) {
            add_feat_to_model(model, spl);
        }
        count++;
        free(spl);
        if (count > 2000000) {
            break;
        }
    }
    fclose( fp );

    printf("Start training\n");
    model.train();
    model.save_to_file("model");
    return 0;
}
示例#11
0
static void
decode_no_context(vector<Token> & vt, const ME_Model & me_none)
{
  int n = vt.size();
  if (n == 0) return;

  for (size_t i = 0; i < n; i++) {
    ME_Sample mes = mesample(vt, i, "", "", "", "");
    me_none.classify(mes);
    vt[i].prd = mes.label;
  }
  
  for (size_t k = 0; k < n; k++) {
    cout << vt[k].str << "/" << vt[k].prd << " ";
  }
  cout << endl;

}
示例#12
0
// Train model
// Trains the global model on the samples added so far and returns an R list
// holding the serialized model followed by the three columns produced by
// export_weights().
//
//   l1, l2    regularizer strength; the first one that is > 0 is used.
//   sgd       use SGD; only consulted when neither l1 nor l2 is > 0.
//   sgd_iter, sgd_eta0, sgd_alpha
//             SGD settings, forwarded to use_SGD(). They were previously
//             accepted but silently ignored.
//   heldout   number of held-out samples; applied only when > 0.
RcppExport SEXP train_model(double l1=0, double l2=0, bool sgd=FALSE, int sgd_iter=30, double sgd_eta0=1, double sgd_alpha=0.85, int heldout=0) {
	Rprintf("Training the new model...\n");
	if (heldout > 0) model.set_heldout(heldout);

	if (l1 > 0) model.use_l1_regularizer(l1);
	else if (l2 > 0) model.use_l2_regularizer(l2);
	// Assumes ME_Model::use_SGD(int iter, double eta0, double alpha), whose
	// defaults (30, 1, 0.85) match this function's -- confirm against maxent.h.
	else if (sgd) model.use_SGD(sgd_iter, sgd_eta0, sgd_alpha);

	model.train();

	string model_data = model.save_to_string();
	vector< vector<string> > weights = export_weights();
	List rs = List::create(model_data,weights[0],weights[1],weights[2]);

	return rs;
}
示例#13
0
/**
 * Runs `model` over every line of the input file and reports accuracy.
 *
 * Each line is parsed by read_line() and converted to a sample. The
 * predicted label is written to the output file, one per line, and compared
 * with the first field of the parsed line (presumably the gold label). A
 * summary "accuracy = correct / total = ratio" is printed to stdout; the
 * ratio is NaN when the input holds no lines.
 *
 * Terminates the process with status 1 when either file cannot be opened.
 */
void validate(const ME_Model & model, 
              const string & input_file_name,
              const string & output_file_name)
{
    ifstream ifile(input_file_name.c_str());
    if (!ifile)
    {
        cerr << "error: cannot open " << input_file_name << endl;
        exit(1);
    }

    // Open the output only after the input is known to be readable, so that
    // a bad input path does not create or truncate the output file.
    ofstream ofile(output_file_name.c_str());
    if (!ofile)
    {
        cerr << "error: cannot open " << output_file_name << endl;
        exit(1);
    }

    int n_correct = 0;
    int n_total   = 0;

    string line;
    while (getline(ifile, line))
    {
        vector<string> vs = read_line(line);
        ME_Sample mes = sample(vs);
        model.predict(mes);

        ofile << mes.label << '\n';

        // A line that parses to no fields has no reference label; it counts
        // as incorrect and vs[0] is not read past the end.
        if (!vs.empty() && mes.label == vs[0])  n_correct++;
        n_total++;
    }

    double accuracy = (double)n_correct / n_total;
    cout << "accuracy = " << n_correct << " / " << n_total
         << " = " << accuracy << endl;
}
示例#14
0
void test(const ME_Model & model, const string & filename) 
{
  ifstream ifile(filename.c_str());
  
  if (!ifile) {
    cerr << "error: cannot open " << filename << endl; 
    exit(1); 
  }

  int num_correct = 0;
  int num_tokens = 0;
  string line;
  while (getline(ifile, line)) {
    vector<Token> vs = read_line(line);
    for (int j = 0; j < (int)vs.size(); j++) {
      ME_Sample mes = sample(vs, j);
      model.classify(mes);
      if (mes.label == vs[j].pos) num_correct++;
      num_tokens++;
    }
  }    
  cout << "accuracy = " << num_correct << " / " << num_tokens << " = " 
       << (double)num_correct / num_tokens << endl;
}
示例#15
0
/**
 * Entry point of the sentence splitter.
 *
 * Usage: <program> input output [path-to-ruby]
 *
 * Steps:
 *   1. Run EventExtracter.rb to turn `input` into "<input>.event".
 *   2. Load the model file "model1-1.0".
 *   3. Read the first line of the event file into `markedTxt`, then classify
 *      every following line and write its label to "<input>.result".
 *   4. Delete the event file and run Classifying2Splitting.rb on the result
 *      file, `markedTxt` and `output`.
 *
 * NOTE(review): the file names, the ruby command and `markedTxt` are pasted
 * unquoted into shell command lines. Names with spaces or shell
 * metacharacters will be misparsed or interpreted by the shell; quote or
 * validate them if the inputs are not trusted.
 */
int main(int argc, char* argv[])
{
    if (argc < 3 || argc > 4) {
        // The space before "input" was missing, gluing it to the program name.
        cerr << "Usage: " << argv[0] << " input output [path-to-ruby]" << endl;
        exit(1);
    }

    ME_Model model;

    string inFile = argv[1];
    string outFile = argv[2];
    string modelFile = "model1-1.0";
    string rubyCommand = (argc == 4) ? argv[3] : "ruby";

    string eventFile = inFile + ".event";
    string resultFile = inFile + ".result";

    cerr << "Extracting events.";

    string extractionCommand = 
        rubyCommand + " EventExtracter.rb " + inFile + " " + eventFile;
    system(extractionCommand.c_str());

    cerr << "roading model file." << endl;
    model.load_from_file(modelFile.c_str());

    // Fail early when the extraction step produced nothing readable or the
    // result file cannot be created, instead of running on empty data.
    ifstream fileIn(eventFile.c_str());
    if (!fileIn) {
        cerr << "error: cannot open " << eventFile << endl;
        exit(1);
    }
    ofstream fileOut(resultFile.c_str());
    if (!fileOut) {
        cerr << "error: cannot open " << resultFile << endl;
        exit(1);
    }

    string line, markedTxt;

    getline(fileIn, markedTxt);
    cerr << "start classification." << endl;
    while (getline(fileIn, line)){
        vector<string> tokens;
        split(line, tokens);
        ME_Sample s;

        // The first token of each event is not a feature and is skipped.
        // An empty token list is handled without stepping past end(); such a
        // line still yields one output line, keeping results aligned with
        // events.
        vector<string>::const_iterator token = tokens.begin();
        if (token != tokens.end()) ++token;
        for (; token != tokens.end(); ++token) {
            s.add_feature(*token);
        }

        (void) model.classify(s);
        fileOut << s.label << endl;
    }
    fileOut.close();
    fileIn.close();

    remove(eventFile.c_str());

    string splitCommand =
        rubyCommand + " Classifying2Splitting.rb "
        + resultFile + " " + markedTxt + " " + outFile;

    system(splitCommand.c_str());

    return 0;
}
示例#16
0
// Decodes the best tag sequence for `vt` with a beam-pruned Viterbi search
// and stores the chosen class label in each token's `prd` field.
//
//   vt  tokens to tag; an empty vector is left untouched.
//   me  model used to score each position given the previous class label.
//
// Data layout:
//   mat[i][k]  best accumulated log score of any path whose class at token
//              i-1 is k; mat[0] is the all-zero start row, so mat ends up
//              with vt.size()+1 rows.
//   bpm[i][k]  the previous class j that produced mat[i+1][k]
//              (back-pointer for token i).
void
viterbi(vector<Token> & vt, const ME_Model & me)
{
  if (vt.size() == 0) return;
  
  vector< vector<double> > mat;
  vector< vector<int> > bpm;
    
  // Start row: every class begins with score 0.
  vector<double> vd(me.num_classes());
  for (size_t j = 0; j < vd.size(); j++) vd[j] = 0;

  mat.push_back(vd);

  for (size_t i = 0; i < vt.size(); i++) {

    // Scores for token i. This `vd` shadows the outer one; -999999 acts as
    // "no path found yet". Entries of `bp` start at 0 (vector<int> default).
    vector<double> vd(me.num_classes());
    for (size_t j = 0; j < vd.size(); j++) vd[j] = -999999;
    vector<int> bp(me.num_classes());

    // Best score in the previous row, the reference point for the beam.
    double maxl = -999999;
    for (size_t j = 0; j < vd.size(); j++) {
      if (mat[i][j] > maxl) maxl = mat[i][j];
    }
    
    for (size_t j = 0; j < vd.size(); j++) {
      if (mat[i][j] < maxl - BEAM_WIDTH) continue; // beam thresholding
      
      string prepos = me.get_class_label(j);
      if (i == 0) {
        // The first token has a single predecessor, "BOS"; it is expanded
        // once, through j == 0, and all other j are skipped.
        if (j > 0) continue;
        prepos = "BOS";
      }
      //      prepos = me.get_class_name(j);
      //      if (i == 0 && prepos != "BOS") continue;

      // Score token i given previous label `prepos`. The values returned by
      // classify() are combined through log(), so they are presumably
      // per-class probabilities.
      ME_Sample mes = mesample(vt, i, prepos);
      vector<double> membp = me.classify(mes);
      for (size_t k = 0; k < vd.size(); k++) {
        double l = mat[i][j] + log(membp[k]);
        if (l > vd[k]) {
          bp[k] = j;
          vd[k] = l;
        }
      }
    }
    mat.push_back(vd);
    //    for (int k = 0; k < vd.size(); k++) cout << bp[k] << " ";
    //    cout << endl;
    bpm.push_back(bp);
  }
  /*
  for (int i = 0; i < vt.size(); i++) {
    int max_prd = 0;
    for (int j = 0; j < vd.size(); j++) {
      double l = mat[i+1][j];
      if (l > mat[i+1][max_prd]) {
        max_prd = j;
      }
    }
    vt[i].prd = me.get_class_name(max_prd);
  }
  */  

  //  cout << "viterbi ";
  // Pick the best-scoring class for the last token (`vd` here is the outer
  // vector, used only for its size) ...
  int max_prd = 0;
  int n = vt.size();
  for (size_t j = 0; j < vd.size(); j++) {
    double l = mat[n][j];
    if (l > mat[n][max_prd]) {
      max_prd = j;
    }
  }
  vt[n-1].prd = me.get_class_label(max_prd);
  // ... then follow the back-pointers from the last token to the first.
  for (int i = vt.size() - 2; i >= 0; i--) {
    //    cout << max_prd << " ";
    //    cerr << max_prd << " ";
    // An out-of-range class index ends the whole process, with exit status 0.
    if (max_prd < 0 || max_prd >= me.num_classes()) exit(0);
    max_prd = bpm[i+1][max_prd];
    vt[i].prd = me.get_class_label(max_prd);
  }
  //  cout << endl;

}
示例#17
0
// New model
void new_model() {
    model.clear();
	model = *(new ME_Model());
}