Exemplo n.º 1
0
// Write one sentence to the output stream in 10-column CoNLL format.
// Gold heads/deprels always come from `inst`; the predicted head/deprel
// columns are emitted only when both prediction vectors are fully
// populated, otherwise the CoNLL placeholder "_" is written.
void CoNLLWriter::write(const Instance& inst) {
  size_t len = inst.size();
  bool predicted = (inst.predict_heads.size() == len &&
      inst.predict_deprels.size() == len);

  // NOTE(review): iteration starts at 1 — index 0 presumably holds a
  // dummy root token; confirm against Instance's construction.
  for (size_t i = 1; i < len; ++ i) {
    f << i << "\t"                  // 0 - index
      << inst.raw_forms[i]   << "\t"   // 1 - form
      << inst.lemmas[i]  << "\t"   // 2 - lemma
      << inst.postags[i] << "\t"   // 3 - postag
      << "_\t"   // 4 - unknown
      << "_\t"   // 5 - unknown
      << inst.heads[i]   << "\t"   // 6 - heads
      << inst.deprels[i] << "\t"   // 7 - deprels
      << (predicted ? to_str(inst.predict_heads[i]) : "_")
      << "\t"
      << (predicted ? inst.predict_deprels[i] : "_")
      << "\n";   // fix: '\n' instead of std::endl — no flush per token
  }
  f << std::endl;  // blank separator line; one flush per sentence is enough
}
Exemplo n.º 2
0
void PostaggerFrontend::train(void) {
  // read in training instance
  INFO_LOG("trace: reading reference dataset ...");
  if (!read_instances(train_opt.train_file.c_str())) {
    ERROR_LOG("Training file doesn't exist.");
  }
  INFO_LOG("trace: %d sentences is loaded.", train_dat.size());

  model = new Model(Extractor::num_templates());
  // build tag dictionary, map string tag to index
  INFO_LOG("report: start building configuration ...");
  build_configuration();
  INFO_LOG("report: build configuration is done.");
  INFO_LOG("report: number of postags: %d", model->labels.size());

  // build feature space from the training instance
  INFO_LOG("report: start building feature space ...");
  build_feature_space();
  INFO_LOG("report: building feature space is done.");
  INFO_LOG("report: number of features: %d", model->space.num_features());

  model->param.realloc(model->space.dim());
  INFO_LOG("report: allocate %d dimensition parameter.", model->space.dim());

  int nr_groups = model->space.num_groups();
  std::vector<int> groupwise_update_counters;

  if (train_opt.rare_feature_threshold > 0) {
    groupwise_update_counters.resize(nr_groups, 0);
    INFO_LOG("report: allocate %d update-time counters", nr_groups);
  } else {
    INFO_LOG("report: model truncation is inactived.");
  }

  int best_iteration = -1;
  double best_p = -1.;

  std::vector<size_t> update_counts;

  for (int iter = 0; iter < train_opt.max_iter; ++ iter) {
    INFO_LOG("Training iteraition #%d", (iter + 1));

    size_t interval= train_dat.size() / 10;
    for (size_t i = 0; i < train_dat.size(); ++ i) {
      Instance* inst = train_dat[i];
      extract_features((*inst), &ctx, false);
      calculate_scores((*inst), ctx, false, &scm);
      decoder.decode(scm, inst->predict_tagsidx);

      collect_features(model, ctx.uni_features, inst->tagsidx, ctx.correct_features);
      collect_features(model, ctx.uni_features, inst->predict_tagsidx, ctx.predict_features);

      SparseVec updated_features;
      updated_features.add(ctx.correct_features, 1.);
      updated_features.add(ctx.predict_features, -1.);

      learn(train_opt.algorithm, updated_features,
        iter*train_dat.size() + 1, inst->num_errors(), model);


      if (train_opt.rare_feature_threshold > 0) {
        increase_groupwise_update_counts(model, updated_features, update_counts);
      }

      ctx.clear();
      if ((i+1) % interval == 0) {
        INFO_LOG("training: %d0%% (%d) instances is trained.", ((i+1)/interval), i+1);
      }
    }
    INFO_LOG("trace: %d instances is trained.", train_dat.size());
    model->param.flush( train_dat.size() * (iter + 1) );

    Model* new_model = new Model(Extractor::num_templates());
    erase_rare_features(model, new_model, train_opt.rare_feature_threshold,
        update_counts);

    std::swap(model, new_model);
    double p;
    evaluate(p);

    if(p > best_p){
      best_p = p;
      best_iteration = iter;
    }

    std::string saved_model_file = (train_opt.model_name+ "."+ to_str(iter));
    std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary);
    std::swap(model, new_model);
    new_model->save(model_header, Parameters::kDumpAveraged, ofs);
    delete new_model;

    INFO_LOG("trace: model for iteration #%d is saved to %s", iter+1, saved_model_file.c_str());
  }

  INFO_LOG("Best result (iteration = %d) : P = %lf", best_iteration, best_p);
}