void CoNLLWriter::write(const Instance& inst) { size_t len = inst.size(); bool predicted = (inst.predict_heads.size() == len && inst.predict_deprels.size() == len); for (size_t i = 1; i < len; ++ i) { f << i << "\t" // 0 - index << inst.raw_forms[i] << "\t" // 1 - form << inst.lemmas[i] << "\t" // 2 - lemma << inst.postags[i] << "\t" // 3 - postag << "_\t" // 4 - unknown << "_\t" // 5 - unknown << inst.heads[i] << "\t" // 6 - heads << inst.deprels[i] << "\t" // 7 - deprels << (predicted ? to_str(inst.predict_heads[i]) : "_") << "\t" << (predicted ? inst.predict_deprels[i] : "_") << std::endl; } f << std::endl; }
void PostaggerFrontend::train(void) { // read in training instance INFO_LOG("trace: reading reference dataset ..."); if (!read_instances(train_opt.train_file.c_str())) { ERROR_LOG("Training file doesn't exist."); } INFO_LOG("trace: %d sentences is loaded.", train_dat.size()); model = new Model(Extractor::num_templates()); // build tag dictionary, map string tag to index INFO_LOG("report: start building configuration ..."); build_configuration(); INFO_LOG("report: build configuration is done."); INFO_LOG("report: number of postags: %d", model->labels.size()); // build feature space from the training instance INFO_LOG("report: start building feature space ..."); build_feature_space(); INFO_LOG("report: building feature space is done."); INFO_LOG("report: number of features: %d", model->space.num_features()); model->param.realloc(model->space.dim()); INFO_LOG("report: allocate %d dimensition parameter.", model->space.dim()); int nr_groups = model->space.num_groups(); std::vector<int> groupwise_update_counters; if (train_opt.rare_feature_threshold > 0) { groupwise_update_counters.resize(nr_groups, 0); INFO_LOG("report: allocate %d update-time counters", nr_groups); } else { INFO_LOG("report: model truncation is inactived."); } int best_iteration = -1; double best_p = -1.; std::vector<size_t> update_counts; for (int iter = 0; iter < train_opt.max_iter; ++ iter) { INFO_LOG("Training iteraition #%d", (iter + 1)); size_t interval= train_dat.size() / 10; for (size_t i = 0; i < train_dat.size(); ++ i) { Instance* inst = train_dat[i]; extract_features((*inst), &ctx, false); calculate_scores((*inst), ctx, false, &scm); decoder.decode(scm, inst->predict_tagsidx); collect_features(model, ctx.uni_features, inst->tagsidx, ctx.correct_features); collect_features(model, ctx.uni_features, inst->predict_tagsidx, ctx.predict_features); SparseVec updated_features; updated_features.add(ctx.correct_features, 1.); updated_features.add(ctx.predict_features, -1.); learn(train_opt.algorithm, updated_features, iter*train_dat.size() + 1, inst->num_errors(), model); if (train_opt.rare_feature_threshold > 0) { increase_groupwise_update_counts(model, updated_features, update_counts); } ctx.clear(); if ((i+1) % interval == 0) { INFO_LOG("training: %d0%% (%d) instances is trained.", ((i+1)/interval), i+1); } } INFO_LOG("trace: %d instances is trained.", train_dat.size()); model->param.flush( train_dat.size() * (iter + 1) ); Model* new_model = new Model(Extractor::num_templates()); erase_rare_features(model, new_model, train_opt.rare_feature_threshold, update_counts); std::swap(model, new_model); double p; evaluate(p); if(p > best_p){ best_p = p; best_iteration = iter; } std::string saved_model_file = (train_opt.model_name+ "."+ to_str(iter)); std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary); std::swap(model, new_model); new_model->save(model_header, Parameters::kDumpAveraged, ofs); delete new_model; INFO_LOG("trace: model for iteration #%d is saved to %s", iter+1, saved_model_file.c_str()); } INFO_LOG("Best result (iteration = %d) : P = %lf", best_iteration, best_p); }