Ejemplo n.º 1
0
/**
 * Remove the surrounding annotation tags from a token.
 *
 * If startswith() reports that the token begins with the word start tag, the
 * lengths of the word start/end tags are cut from the front/back; otherwise
 * the same is attempted with the partial start/end tags. A token matching
 * neither start tag is copied through unchanged.
 *
 * Only the leading tag is tested; the trailing tag is assumed to be present
 * and is not verified here.
 *
 *  @param[in]  input   the (possibly tagged) token.
 *  @param[out] output  the token with its tags stripped.
 */
void PartialSegmentationUtils::trim_partial_tag(const std::string& input, std::string& output) {
  if (startswith(input, __word_start__)) {
    const size_t head = __word_start__.length();
    const size_t tail = __word_end__.length();
    output = input.substr(head, input.length() - head - tail);
    return;
  }

  if (startswith(input, __partial_start__)) {
    const size_t head = __partial_start__.length();
    const size_t tail = __partial_end__.length();
    output = input.substr(head, input.length() - head - tail);
    return;
  }

  // No recognised tag: hand the token back as it is.
  output = input;
}
Ejemplo n.º 2
0
/**
 * Build the set of allowed label transitions from textual rules.
 *
 * Each rule is split once on "->" into a source and a target label. Both are
 * trimmed and looked up in the alphabet; the pair is stored in rep as the
 * single key (from * T + to), where T is initialised to alphabet.size().
 * Rules that do not split into exactly two parts, or whose labels are not
 * found in the alphabet (index -1), are reported with a warning and skipped.
 *
 *  @param[in]  alphabet  the label alphabet used to index label names.
 *  @param[in]  includes  the rule strings, one transition per entry.
 */
NERTransitionConstrain::NERTransitionConstrain(const utility::IndexableSmartMap& alphabet,
    const std::vector<std::string>& includes): T(alphabet.size()) {
  for (std::vector<std::string>::const_iterator rule = includes.begin();
      rule != includes.end(); ++ rule) {
    std::vector<std::string> parts = split_by_sep(*rule, "->", 1);
    if (parts.size() != 2) {
      WARNING_LOG("constrain text \"%s\" is in illegal format.", rule->c_str());
      continue;
    }

    // Trim once and reuse the result for both the lookup and the warning.
    const std::string source_label = trim_copy(parts[0]);
    const std::string target_label = trim_copy(parts[1]);

    int from = alphabet.index(source_label);
    int to = alphabet.index(target_label);
    if (-1 == from || -1 == to) {
      WARNING_LOG("label in constrain text \"%s,%s\" is not in alphabet.",
          source_label.c_str(), target_label.c_str());
      continue;
    }

    rep.insert(from * T + to);
  }
}
Ejemplo n.º 3
0
/**
 * Split a line into untagged text chunks and tagged segments.
 *
 * When the line contains neither the partial start tag nor the word start
 * tag, it is split with split() and the result replaces the content of words.
 * Otherwise the line is scanned left to right: the text before each start tag
 * is appended to words (when non-empty), followed by the whole tagged segment
 * including its start and end tags.
 *
 *  @param[in]  line   the input line.
 *  @param[out] words  the resulting pieces. In the tagged case pieces are
 *                     appended, and on failure words may hold the pieces
 *                     collected before the error was detected.
 *  @return     0 if the line contains no tags, 1 if tagged segments were
 *              extracted, -1 if the line is malformed (a tag nested inside a
 *              segment, or a start tag without a matching end tag).
 */
int PartialSegmentationUtils::split_by_partial_tag(
    const std::string& line,
    std::vector<std::string>& words) {
  size_t offset1 = line.find(__partial_start__);
  size_t offset2 = line.find(__word_start__);

  if (offset1 == std::string::npos && offset2 == std::string::npos) {
    // 0 representing no partial tags. split with the original tags.
    words = split(line);
    return 0;
  }

  size_t offset = 0;
  size_t prelude = 0, coda = 0, len_start_tag = 0, len_end_tag = 0;

  while (offset < line.length()) {
    // The nearest start tag decides which kind of segment comes next; npos
    // compares greater than any valid position, so a missing tag never wins.
    prelude = (offset1 < offset2 ? offset1 : offset2);
    std::string word = line.substr(offset, prelude - offset);
    if (word.length() > 0) { words.push_back( word ); }

    if (offset1 < offset2) {
      coda = line.find(__partial_end__, prelude);
      len_start_tag = __partial_start__.length();
      len_end_tag   = __partial_end__.length();
    } else {
      coda = line.find(__word_end__, prelude);
      len_start_tag = __word_start__.length();
      len_end_tag   = __word_end__.length();
    }

    if (coda == std::string::npos) {
      // Unterminated segment. Without this check `coda + len_end_tag` wraps
      // around to a small offset, which yields bogus tokens and can make the
      // scan revisit the same start tag forever.
      return -1;
    }

    word = line.substr(prelude + len_start_tag, coda - prelude - len_start_tag);

    // A tag inside the segment body means the annotation is nested or broken.
    if ((word.find(__partial_start__) != std::string::npos) ||
        (word.find(__partial_end__)   != std::string::npos) ||
        (word.find(__word_start__)    != std::string::npos) ||
        (word.find(__word_end__)      != std::string::npos)) {
      return -1;
    }

    // Keep the segment together with its tags.
    words.push_back( line.substr(prelude, coda - prelude + len_end_tag) );
    offset = coda + len_end_tag;

    offset1 = line.find(__partial_start__, offset);
    offset2 = line.find(__word_start__, offset);

    if (offset1 == std::string::npos && offset2 == std::string::npos) {
      // No further tags: whatever is left is one trailing untagged chunk.
      word = line.substr(offset);
      if (word.length() > 0) { words.push_back( word ); }
      break;
    }
  }
  return 1;
}
Ejemplo n.º 4
0
Archivo: io.cpp Proyecto: 153370771/ltp
/**
 * Read the next instance from the input stream, one line per instance.
 *
 * The line is trimmed and split into tokens with split(). When with_tag is
 * set, every token is split once from the right on the delimiter into a form
 * and a tag; otherwise the whole token is taken as the form. Each form is
 * stored both as read (raw_forms) and after sbc2dbc_x() (forms).
 *
 *  @return  a newly allocated instance owned by the caller, or 0 when the
 *           stream is at EOF, the trimmed line is empty, or a token cannot be
 *           split into exactly a form and a tag (the offending token is
 *           written to std::cerr).
 */
Instance* PostaggerReader::next() {
  if (is.eof()) { return 0; }

  ++ cursor;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }

  Instance* inst = new Instance;
  std::string line;
  std::getline(is, line);
  trim(line);

  if (line.empty()) {
    delete inst;
    return 0;
  }

  std::vector<std::string> tokens = split(line);
  for (size_t idx = 0; idx < tokens.size(); ++ idx) {
    std::string& token = tokens[idx];

    if (!with_tag) {
      // Untagged input: the token itself is the form.
      inst->raw_forms.push_back(token);
      inst->forms.push_back(sbc2dbc_x(token));
      continue;
    }

    std::vector<std::string> parts = rsplit_by_sep(token, delimiter, 1);
    if (parts.size() != 2) {
      // Token is not of the shape form<delimiter>tag: give up on the line.
      std::cerr << token << std::endl;
      delete inst;
      return 0;
    }

    inst->raw_forms.push_back(parts[0]);
    inst->forms.push_back(sbc2dbc_x(parts[0]));
    inst->tags.push_back(parts[1]);
  }
  return inst;
}
Ejemplo n.º 5
0
/**
 * Read the next sentence from the input stream.
 *
 * The instance is seeded with a pseudo ROOT token at position 0 (head -1).
 * Lines are then consumed until an empty (after trimming) line or EOF; each
 * line is split with split() and columns 1, 2, 3, 6 and 7 are taken as form,
 * lemma, postag, head and dependency relation.
 *
 *  @return  a newly allocated instance owned by the caller, or NULL when the
 *           stream is at EOF, no token was read, or a row has fewer than the
 *           8 columns required (a warning is logged in that case).
 */
Instance* CoNLLReader::next() {
  if (is.eof()) { return NULL;}

  Instance* inst = new Instance;
  std::string line;

  inst->raw_forms.push_back( SpecialOption::ROOT );
  inst->forms.push_back( SpecialOption::ROOT );
  inst->lemmas.push_back( SpecialOption::ROOT );
  inst->postags.push_back( SpecialOption::ROOT );
  inst->heads.push_back( -1 );
  inst->deprels.push_back( SpecialOption::ROOT );

  while (!is.eof()) {
    getline(is, line);
    trim(line);

    if (line.size() == 0) { break; }
    std::vector<std::string> items = split(line);

    if (items.size() < 8) {
      // Indexing items[1]..items[7] on a short row would read out of bounds,
      // so reject the input instead of only warning about it.
      WARNING_LOG("Unknown conll format file");
      delete inst;
      return NULL;
    }

    inst->raw_forms.push_back( items[1] );  // items[1]: form
    inst->forms.push_back( sbc2dbc(items[1]) );
    inst->lemmas.push_back( items[2] );     // items[2]: lemma
    inst->postags.push_back( items[3] );    // items[3]: postag
    inst->heads.push_back( to_int(items[6]) );  // items[6]: head
    inst->deprels.push_back( items[7] );    // items[7]: deprel
  }

  // Only the ROOT placeholder is present: nothing was actually read.
  if (inst->forms.size() == 1) {
    delete inst;
    inst = NULL;
  }

  return inst;
}
Ejemplo n.º 6
0
/**
 * Write one instance as tab-separated rows followed by a blank line.
 *
 * Position 0 is skipped, so rows are numbered from 1. Each row holds:
 * index, form, lemma, postag, two "_" placeholders, head, deprel, predicted
 * head and predicted deprel. The two predicted columns are written as "_"
 * unless both prediction vectors have the same length as the instance.
 *
 *  @param[in]  inst  the instance to write.
 */
void CoNLLWriter::write(const Instance& inst) {
  const size_t len = inst.size();
  const bool has_prediction =
    (inst.predict_heads.size() == len) && (inst.predict_deprels.size() == len);

  for (size_t row = 1; row < len; ++ row) {
    // Gold columns 0-7.
    f << row << "\t";
    f << inst.raw_forms[row] << "\t";
    f << inst.lemmas[row] << "\t";
    f << inst.postags[row] << "\t";
    f << "_\t";
    f << "_\t";
    f << inst.heads[row] << "\t";
    f << inst.deprels[row] << "\t";

    // Predicted columns 8-9.
    if (has_prediction) {
      f << to_str(inst.predict_heads[row]) << "\t" << inst.predict_deprels[row];
    } else {
      f << "_" << "\t" << "_";
    }
    f << std::endl;
  }

  // Blank line terminates the sentence.
  f << std::endl;
}
Ejemplo n.º 7
0
/**
 * Train the postagger model.
 *
 * Loads the training data, builds the label configuration and the feature
 * space, then runs train_opt.max_iter passes over the data. In each pass
 * every instance is decoded with the current parameters and the model is
 * updated from the difference between the gold and the predicted features.
 * After each pass the parameters are flushed, a copy with rare features
 * erased is evaluated, and that copy is saved to "<model_name>.<iter>".
 *
 * Returns early (after logging an error) when the training data cannot be
 * read.
 */
void PostaggerFrontend::train(void) {
  // read in training instance
  INFO_LOG("trace: reading reference dataset ...");
  if (!read_instances(train_opt.train_file.c_str())) {
    ERROR_LOG("Training file doesn't exist.");
    // Nothing to train on; continuing would only build and save empty models.
    return;
  }
  INFO_LOG("trace: %d sentences is loaded.", static_cast<int>(train_dat.size()));

  model = new Model(Extractor::num_templates());
  // build tag dictionary, map string tag to index
  INFO_LOG("report: start building configuration ...");
  build_configuration();
  INFO_LOG("report: build configuration is done.");
  INFO_LOG("report: number of postags: %d", static_cast<int>(model->labels.size()));

  // build feature space from the training instance
  INFO_LOG("report: start building feature space ...");
  build_feature_space();
  INFO_LOG("report: building feature space is done.");
  INFO_LOG("report: number of features: %d", model->space.num_features());

  model->param.realloc(model->space.dim());
  INFO_LOG("report: allocate %d dimensition parameter.", model->space.dim());

  int nr_groups = model->space.num_groups();
  // NOTE(review): this vector is sized here but never read below; the counts
  // actually passed around are `update_counts`. Confirm which one is intended.
  std::vector<int> groupwise_update_counters;

  if (train_opt.rare_feature_threshold > 0) {
    groupwise_update_counters.resize(nr_groups, 0);
    INFO_LOG("report: allocate %d update-time counters", nr_groups);
  } else {
    INFO_LOG("report: model truncation is inactived.");
  }

  int best_iteration = -1;
  double best_p = -1.;

  std::vector<size_t> update_counts;

  for (int iter = 0; iter < train_opt.max_iter; ++ iter) {
    INFO_LOG("Training iteraition #%d", (iter + 1));

    // Progress is reported every 10% of the data. With fewer than 10
    // instances the interval is 0, so reporting is skipped rather than
    // taking a modulo by zero.
    const size_t interval = train_dat.size() / 10;
    for (size_t i = 0; i < train_dat.size(); ++ i) {
      Instance* inst = train_dat[i];
      extract_features((*inst), &ctx, false);
      calculate_scores((*inst), ctx, false, &scm);
      decoder.decode(scm, inst->predict_tagsidx);

      collect_features(model, ctx.uni_features, inst->tagsidx, ctx.correct_features);
      collect_features(model, ctx.uni_features, inst->predict_tagsidx, ctx.predict_features);

      // gold features minus predicted features
      SparseVec updated_features;
      updated_features.add(ctx.correct_features, 1.);
      updated_features.add(ctx.predict_features, -1.);

      // The update timestamp advances with every instance so that it lines up
      // with the flush at train_dat.size() * (iter + 1) after the pass.
      learn(train_opt.algorithm, updated_features,
        iter*train_dat.size() + i + 1, inst->num_errors(), model);

      if (train_opt.rare_feature_threshold > 0) {
        increase_groupwise_update_counts(model, updated_features, update_counts);
      }

      ctx.clear();
      if (interval > 0 && (i+1) % interval == 0) {
        INFO_LOG("training: %d0%% (%d) instances is trained.",
            static_cast<int>((i+1)/interval), static_cast<int>(i+1));
      }
    }
    INFO_LOG("trace: %d instances is trained.", static_cast<int>(train_dat.size()));
    model->param.flush( train_dat.size() * (iter + 1) );

    Model* new_model = new Model(Extractor::num_templates());
    erase_rare_features(model, new_model, train_opt.rare_feature_threshold,
        update_counts);

    // Evaluate with the truncated model installed as `model`, then swap the
    // full model back so training continues on it.
    std::swap(model, new_model);
    double p = 0.;
    evaluate(p);

    if(p > best_p){
      best_p = p;
      best_iteration = iter;
    }

    std::string saved_model_file = (train_opt.model_name+ "."+ to_str(iter));
    std::ofstream ofs(saved_model_file.c_str(), std::ofstream::binary);
    std::swap(model, new_model);
    new_model->save(model_header, Parameters::kDumpAveraged, ofs);
    delete new_model;

    INFO_LOG("trace: model for iteration #%d is saved to %s", iter+1, saved_model_file.c_str());
  }

  INFO_LOG("Best result (iteration = %d) : P = %lf", best_iteration, best_p);
}
Ejemplo n.º 8
0
/**
 * Tell whether a token carries the word start tag.
 *
 *  @param[in]  word  the token to test.
 *  @return     the result of startswith(word, __word_start__).
 */
bool PartialSegmentationUtils::is_partial_tagged_word(const std::string& word) {
  const bool has_word_tag = startswith(word, __word_start__);
  return has_word_tag;
}
Ejemplo n.º 9
0
Archivo: io.cpp Proyecto: 153370771/ltp
/**
 * Read the next instance from the input stream, one line per instance.
 *
 * For unsegmented input the whole trimmed line is passed to the preprocessor,
 * which fills raw_forms, forms and chartypes. For segmented input the line is
 * split into words, each word is preprocessed separately, and one tag is
 * appended per character reported by the preprocessor: __s__ for a
 * one-character word, otherwise __b__ for the first character, __e__ for the
 * last and __i__ for those in between.
 *
 *  @return  a newly allocated instance owned by the caller, or 0 when the
 *           stream is at EOF, the trimmed line is empty, or the preprocessor
 *           returns a negative value.
 */
Instance* SegmentReader::next() {
  if (is.eof()) { return 0; }

  ++ cursor;
  if (trace && cursor % interval == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }

  Instance* inst = new Instance;
  std::string line;
  std::getline(is, line);
  trim(line);

  if (line.empty()) {
    delete inst;
    return 0;
  }

  if (!segmented) {
    // Raw text: only the character-level fields are filled.
    int ret = preprocessor.preprocess(line, inst->raw_forms,
        inst->forms, inst->chartypes);
    if (ret < 0) {
      delete inst;
      return 0;
    }
    return inst;
  }

  std::vector<std::string> words = strutils::split(line);
  inst->words = words;

  for (size_t w = 0; w < words.size(); ++ w) {
    int num_chars = preprocessor.preprocess(words[w], inst->raw_forms,
        inst->forms, inst->chartypes);
    if (num_chars < 0) {
      delete inst;
      return 0;
    }

    if (1 == num_chars) {
      // A one-character word gets the single tag.
      inst->tags.push_back( __s__ );
      continue;
    }

    // Longer word: begin tag, inner tags, end tag.
    const int last = num_chars - 1;
    for (int pos = 0; pos < num_chars; ++ pos) {
      if (0 == pos) {
        inst->tags.push_back( __b__ );
      } else if (last == pos) {
        inst->tags.push_back( __e__ );
      } else {
        inst->tags.push_back( __i__ );
      }
    }
  }
  return inst;
}