/**
 * Builds training samples from a corpus file, trains `model`, and saves it.
 *
 * Every line of `filename` is parsed with read_line(); one sample per token is
 * created with sample() and added to `model`. Reading stops at end of file or
 * once the line counter exceeds 10000. The model is then configured with
 * use_l1_regularizer(1.0) and set_heldout(100), trained, and written to the
 * file "model".
 *
 * Exits the process with status 1 if `filename` cannot be opened.
 */
void train(ME_Model & model, const string & filename)
{
  ifstream corpus(filename.c_str());
  if (!corpus) {
    cerr << "error: cannot open " << filename << endl;
    exit(1);
  }

  int lines_seen = 0;
  for (string text; getline(corpus, text); ) {
    vector<Token> tokens = read_line(text);
    const int token_count = static_cast<int>(tokens.size());
    int idx = 0;
    while (idx < token_count) {
      ME_Sample mes = sample(tokens, idx);
      model.add_training_sample(mes);
      ++idx;
    }

    // The counter is compared before being bumped, so the cap trips only
    // after the line that found it already above 10000 has been processed.
    if (lines_seen > 10000) break;
    ++lines_seen;
  }

  model.use_l1_regularizer(1.0);
  model.set_heldout(100);
  model.train();
  model.save_to_file("model");
}
int bidir_train(const vector<Sentence> & vs, int para) { // vme.clear(); // vme.resize(16); for (int t = 0; t < 16; t++) { if (t != 15 && t != 0) continue; // for (int t = 15; t >= 0; t--) { vector<ME_Sample> train; if (para != -1 && t % 4 != para) continue; // if (t % 2 == 1) continue; cerr << "type = " << t << endl; cerr << "extracting features..."; int n = 0; for (vector<Sentence>::const_iterator i = vs.begin(); i != vs.end(); i++) { const Sentence & s = *i; for (int j = 0; j < s.size(); j++) { string pos_left1 = "BOS", pos_left2 = "BOS2"; if (j >= 1) pos_left1 = s[j-1].pos; if (j >= 2) pos_left2 = s[j-2].pos; string pos_right1 = "EOS", pos_right2 = "EOS2"; if (j <= int(s.size()) - 2) pos_right1 = s[j+1].pos; if (j <= int(s.size()) - 3) pos_right2 = s[j+2].pos; if ( (t & 0x8) == 0 ) pos_left2 = ""; if ( (t & 0x4) == 0 ) pos_left1 = ""; if ( (t & 0x2) == 0 ) pos_right1 = ""; if ( (t & 0x1) == 0 ) pos_right2 = ""; train.push_back(mesample(s, j, pos_left2, pos_left1, pos_right1, pos_right2)); } // if (n++ > 1000) break; } cerr << "done" << endl; ME_Model m; // m.set_heldout(1000,0); // m.train(train, 2, 1000, 0); m.train(train, 2, 0, 1); char buf[1000]; sprintf(buf, "model.bidir.%d", t); m.save_to_file(buf); } }
// Print weights void print_weights() { list< pair< pair<string, string>, double > > fl; model.get_features(fl); for (list< pair< pair<string, string>, double> >::const_iterator i = fl.begin(); i != fl.end(); i++) { Rprintf("%10.3f %-10s %s\n", i->second, i->first.first.c_str(), i->first.second.c_str()); } }
// Export weights vector< vector<string> > export_weights() { list< pair< pair<string, string>, double > > fl; model.get_features(fl); vector<string> value1; vector<string> value2; vector<string> value3; for (list< pair< pair<string, string>, double> >::const_iterator i = fl.begin(); i != fl.end(); i++) { stringstream write_weights1; write_weights1 << setprecision(3) << setw(10) << i->second; string weights1 = write_weights1.str(); value1.push_back(weights1); stringstream write_weights2; write_weights2 << left << setw(10) << i->first.first.c_str(); string weights2 = write_weights2.str(); value2.push_back(weights2); stringstream write_weights3; write_weights3 << i->first.second.c_str(); string weights3 = write_weights3.str(); value3.push_back(weights3); } vector< vector<string> > results; results.push_back(value1); results.push_back(value2); results.push_back(value3); return results; }
/**
 * Turns one row of numeric features into a training sample and adds it to
 * `model`.
 *
 * The sample label comes from label_sample(feat); feature i is named "feat<i>"
 * and carries the value feat[i].
 *
 * @param model  model receiving the sample.
 * @param feat   array with at least `nfeat` values.
 * @param nfeat  number of features to read from `feat` (default 7).
 */
void add_feat_to_model(ME_Model & model, double *feat, int nfeat = 7)
{
  // NOTE(review): ownership of the label returned by label_sample() is not
  // visible from here, so it is deliberately left untouched.
  char *label = label_sample(feat);
  ME_Sample samp(label);

  // One reusable stack buffer replaces the per-feature malloc(10) that was
  // never freed. "feat" plus any int value (at most 11 characters) plus the
  // terminator needs 16 bytes, so 32 cannot overflow; the old 10-byte buffer
  // overflowed for i >= 100000.
  // Assumes add_feature() copies the name, as its std::string call sites
  // elsewhere in this file suggest - TODO confirm.
  char mark[32];
  for (int i = 0; i < nfeat; i++) {
    sprintf(mark, "feat%d", i);
    samp.add_feature(mark, feat[i]);
  }
  model.add_training_sample(samp);
}
/**
 * Reads one training sample per line from `input`, trains `model` on them, and
 * saves the result to `model_path`.
 *
 * Each line is tokenised with split() and turned into a sample through the
 * ME_Sample(fields, true) constructor.
 *
 * Exits the process with status 1 if `input` cannot be opened.
 */
void train(ME_Model & model, const string & input, const string & model_path)
{
  ifstream source(input.c_str());
  if (!source) {
    cerr << "error: cannot open " << input << endl;
    exit(1);
  }

  for (string record; getline(source, record); ) {
    vector<string> fields = split(record);
    ME_Sample mes(fields, true);
    model.add_training_sample(mes);
  }

  model.train();
  model.save(model_path);
}
void add_samples(int nrows, int ncols, vector<string> samples, vector<double> ia, vector<string> ja, vector<double> ra) { new_model(); for (int i=0; i < nrows; i++) { // for each document //Rprintf("Document %d\n",i); // debug output ME_Sample newSample(samples[i]); // create new sample for code for (int j=ia[i]-1; j < ia[i+1]-1; j++) { // for each feature //Rprintf("Feature %s\n",features[j].c_str()); newSample.add_feature(ja[j],ra[j]); } model.add_training_sample(newSample); } }
/**
 * Classifies `nrows` documents with a model restored from `model_data`.
 *
 * The global `model` is reset with new_model() and loaded from the string.
 * Features of row i are read from `ja`/`ra` at zero-based positions ia[i]-1 up
 * to, but not including, ia[i+1]-1. `ncols` is not used.
 *
 * Returns an R list holding, in order: the label stored on each sample after
 * classification, an nrows x num_classes matrix of the values returned by
 * classify(), and the class label for each matrix column.
 */
RcppExport SEXP classify_samples(int nrows, int ncols, vector<double> ia, vector<string> ja, vector<double> ra, string model_data)
{
  new_model();
  model.load_from_string(model_data);

  const int n_classes = model.num_classes();
  vector<string> labels;
  NumericMatrix probs(nrows, n_classes);

  for (int row = 0; row < nrows; ++row) {
    ME_Sample doc;
    const double stop = ia[row + 1] - 1;
    for (int k = ia[row] - 1; k < stop; ++k) {
      doc.add_feature(ja[k], ra[k]);
    }

    vector<double> dist = model.classify(doc);
    for (int c = 0; c < n_classes; ++c) {
      probs(row, c) = dist[c];
    }
    labels.push_back(doc.label);
  }

  vector<string> class_names;
  for (int c = 0; c < n_classes; ++c) {
    class_names.push_back(model.get_class_label(c));
  }

  List packed = List::create(labels, probs, class_names);
  return packed;
}
/**
 * Entry point: loads a saved model and runs validate() on an input file.
 *
 * Arguments: model_file_name, input_file_name, output_file_name.
 * Calls exit_with_help() when fewer than three arguments are supplied.
 */
int main(int argc, char** argv)
{
  if (argc < 4) {
    exit_with_help();
  }

  string model_path(argv[1]);
  string input_path(argv[2]);
  string output_path(argv[3]);

  ME_Model m;
  m.load(model_path);
  validate(m, input_path, output_path);

  return 0;
}
/**
 * Entry point: trains a model from a tab-separated feature file and saves it
 * to "model".
 *
 * Usage: <program> train_file [test_file]
 *
 * Each line of train_file is parsed with split(line, '\t', 7) and fed to
 * add_feat_to_model(). At most 2000001 lines are read. The optional test_file
 * argument is accepted but unused.
 *
 * Returns 1 when the training file is missing from the command line or cannot
 * be opened, 0 otherwise.
 */
int main(int argc, char *argv[])
{
  // argv[1] used to be read without checking that it exists.
  if (argc < 2) {
    fprintf(stderr, "Usage: %s train_file [test_file]\n", argv[0]);
    return 1;
  }
  // The original opened an undeclared `filename`; the training path held in
  // argv[1] is the evident intent.
  const char *train = argv[1];

  ME_Model model;
  int ncols = 7;

  FILE *fp = fopen(train, "r");
  if (fp == NULL) {
    fprintf(stderr, "Error opening file %s\n", train);
    return 1;
  }

  // A fixed stack buffer replaces the unchecked malloc of the same size.
  char buf[300];
  int count = 0;
  while (fgets(buf, (int)sizeof(buf), fp) != NULL) {
    double *spl = split(buf, '\t', ncols);

    // NOTE(review): this adds the same row `ncols` times, because
    // add_feat_to_model() already consumes every column in one call. Kept to
    // preserve the trained model; confirm whether one call is intended.
    for (int i = 0; i < ncols; i++) {
      add_feat_to_model(model, spl);
    }

    count++;
    free(spl);
    if (count > 2000000) {
      break;
    }
  }
  fclose(fp);

  printf("Start training\n");
  model.train();
  model.save_to_file("model");
  return 0;
}
/**
 * Tags every token independently, with all four context strings passed to
 * mesample() left empty, then prints the result.
 *
 * Each token's `prd` field receives the label that classify() stored on its
 * sample. The sentence is written to stdout as "str/prd " pairs followed by a
 * newline. An empty `vt` produces no output at all.
 */
static void decode_no_context(vector<Token> & vt, const ME_Model & me_none)
{
  if (vt.empty()) return;

  const size_t count = vt.size();
  size_t pos = 0;
  while (pos < count) {
    ME_Sample mes = mesample(vt, pos, "", "", "", "");
    me_none.classify(mes);
    vt[pos].prd = mes.label;
    ++pos;
  }

  for (vector<Token>::const_iterator tok = vt.begin(); tok != vt.end(); ++tok) {
    cout << tok->str << "/" << tok->prd << " ";
  }
  cout << endl;
}
// Train model RcppExport SEXP train_model(double l1=0, double l2=0, bool sgd=FALSE, int sgd_iter=30, double sgd_eta0=1, double sgd_alpha=0.85, int heldout=0) { Rprintf("Training the new model...\n"); if (heldout > 0) model.set_heldout(heldout); if (l1 > 0) model.use_l1_regularizer(l1); else if (l2 > 0) model.use_l2_regularizer(l2); else if (sgd) model.use_SGD(); model.train(); string model_data = model.save_to_string(); vector< vector<string> > weights = export_weights(); List rs = List::create(model_data,weights[0],weights[1],weights[2]); return rs; }
void validate(const ME_Model & model, const string & input_file_name, const string & output_file_name) { ifstream ifile(input_file_name.c_str()); ofstream ofile(output_file_name.c_str()); if (!ifile) { cerr << "error: cannot open " << input_file_name << endl; exit(1); } if (!ofile) { cerr << "error: cannot open " << output_file_name << endl; exit(1); } int n_correct = 0; int n_total = 0; string line; while (getline(ifile, line)) { vector<string> vs = read_line(line); ME_Sample mes = sample(vs); model.predict(mes); ofile << mes.label << endl; if (mes.label == vs[0]) n_correct++; n_total++; } double accuracy = (double)n_correct / n_total; cout << "accuracy = " << n_correct << " / " << n_total << " = " << accuracy << endl; }
void test(const ME_Model & model, const string & filename) { ifstream ifile(filename.c_str()); if (!ifile) { cerr << "error: cannot open " << filename << endl; exit(1); } int num_correct = 0; int num_tokens = 0; string line; while (getline(ifile, line)) { vector<Token> vs = read_line(line); for (int j = 0; j < (int)vs.size(); j++) { ME_Sample mes = sample(vs, j); model.classify(mes); if (mes.label == vs[j].pos) num_correct++; num_tokens++; } } cout << "accuracy = " << num_correct << " / " << num_tokens << " = " << (double)num_correct / num_tokens << endl; }
int main(int argc, char* argv[]) { if (argc < 3 || argc > 4) { cerr << "Usage: " << argv[0] << "input output [path-to-ruby]" << endl; exit(1); } ME_Model model; string inFile = argv[1]; string outFile = argv[2]; //string modelFile = argv[3]; string modelFile = "model1-1.0"; string rubyCommand = (argc == 4) ? argv[3] : "ruby"; string eventFile = inFile + ".event"; string resultFile = inFile + ".result"; cerr << "Extracting events."; string extractionCommand = rubyCommand + " EventExtracter.rb " + inFile + " " + eventFile; system(extractionCommand.c_str()); cerr << "roading model file." << endl; model.load_from_file(modelFile.c_str()); //model.load_from_file("model" + setID + "-" + ineq); //ifstream fileIn(string("/home/users/y-matsu/private/workspace/eclipse-workspace/GENIASS/" + setID + "/test.txt").c_str()); //ofstream fileOut(string("/home/users/y-matsu/private/workspace/eclipse-workspace/GENIASS/" + setID + "/test-" + ineq + ".prob").c_str()); ifstream fileIn(eventFile.c_str()); ofstream fileOut(resultFile.c_str()); string line, markedTxt; getline(fileIn, markedTxt); cerr << "start classification." << endl; while (getline(fileIn, line)){ vector<string> tokens; split(line, tokens); ME_Sample s; for(vector<string>::const_iterator token = tokens.begin() + 1; token != tokens.end(); ++token){ s.add_feature(*token); } (void) model.classify(s); fileOut << s.label << endl; } fileOut.close(); fileIn.close(); remove(eventFile.c_str()); string splitCommand = rubyCommand + " Classifying2Splitting.rb " + resultFile + " " + markedTxt + " " + outFile; system(splitCommand.c_str()); return 0; }
void viterbi(vector<Token> & vt, const ME_Model & me) { if (vt.size() == 0) return; vector< vector<double> > mat; vector< vector<int> > bpm; vector<double> vd(me.num_classes()); for (size_t j = 0; j < vd.size(); j++) vd[j] = 0; mat.push_back(vd); for (size_t i = 0; i < vt.size(); i++) { vector<double> vd(me.num_classes()); for (size_t j = 0; j < vd.size(); j++) vd[j] = -999999; vector<int> bp(me.num_classes()); double maxl = -999999; for (size_t j = 0; j < vd.size(); j++) { if (mat[i][j] > maxl) maxl = mat[i][j]; } for (size_t j = 0; j < vd.size(); j++) { if (mat[i][j] < maxl - BEAM_WIDTH) continue; // beam thresholding string prepos = me.get_class_label(j); if (i == 0) { if (j > 0) continue; prepos = "BOS"; } // prepos = me.get_class_name(j); // if (i == 0 && prepos != "BOS") continue; ME_Sample mes = mesample(vt, i, prepos); vector<double> membp = me.classify(mes); for (size_t k = 0; k < vd.size(); k++) { double l = mat[i][j] + log(membp[k]); if (l > vd[k]) { bp[k] = j; vd[k] = l; } } } mat.push_back(vd); // for (int k = 0; k < vd.size(); k++) cout << bp[k] << " "; // cout << endl; bpm.push_back(bp); } /* for (int i = 0; i < vt.size(); i++) { int max_prd = 0; for (int j = 0; j < vd.size(); j++) { double l = mat[i+1][j]; if (l > mat[i+1][max_prd]) { max_prd = j; } } vt[i].prd = me.get_class_name(max_prd); } */ // cout << "viterbi "; int max_prd = 0; int n = vt.size(); for (size_t j = 0; j < vd.size(); j++) { double l = mat[n][j]; if (l > mat[n][max_prd]) { max_prd = j; } } vt[n-1].prd = me.get_class_label(max_prd); for (int i = vt.size() - 2; i >= 0; i--) { // cout << max_prd << " "; // cerr << max_prd << " "; if (max_prd < 0 || max_prd >= me.num_classes()) exit(0); max_prd = bpm[i+1][max_prd]; vt[i].prd = me.get_class_label(max_prd); } // cout << endl; }
// New model void new_model() { model.clear(); model = *(new ME_Model()); }