Exemplo n.º 1
0
// Split a line into chunks, keeping every tagged span as one chunk.
//
// Text lying outside of any tag is pushed as-is (when non-empty). Each span
// delimited by __partial_start__/__partial_end__ or by
// __word_start__/__word_end__ is pushed together with its enclosing tags.
//
//  @param[in]  line   The input line.
//  @param[out] words  The output chunks. When the line holds no start tag it
//                     is overwritten with split(line); otherwise chunks are
//                     appended to it.
//  @return     0 when the line holds no start tag,
//              1 when tagged spans were extracted,
//             -1 when the line is malformed: a start tag has no matching end
//                tag, or a tagged span contains another tag. In that case
//                words may already hold the chunks pushed before the error.
int PartialSegmentationUtils::split_by_partial_tag(
    const std::string& line,
    std::vector<std::string>& words) {
  size_t offset1 = line.find(__partial_start__);
  size_t offset2 = line.find(__word_start__);

  if (offset1 == std::string::npos && offset2 == std::string::npos) {
    // 0 representing no partial tags. split with the original tags.
    words = split(line);
    return 0;
  }

  size_t offset = 0;
  size_t prelude = 0, coda = 0, len_start_tag = 0, len_end_tag = 0;

  while (offset < line.length()) {
    // At least one of offset1/offset2 is a valid position here, and npos
    // compares greater than any position, so the smaller one is the start
    // of the next tagged span.
    prelude = (offset1 < offset2 ? offset1 : offset2);

    // Untagged text between the previous span and this one.
    std::string word = line.substr(offset, prelude - offset);
    if (word.length() > 0) { words.push_back( word ); }

    // Look for the end tag only behind the start tag, so that the length
    // computed below (coda - inner_begin) can never underflow.
    if (offset1 < offset2) {
      len_start_tag = __partial_start__.length();
      len_end_tag   = __partial_end__.length();
      coda = line.find(__partial_end__, prelude + len_start_tag);
    } else {
      len_start_tag = __word_start__.length();
      len_end_tag   = __word_end__.length();
      coda = line.find(__word_end__, prelude + len_start_tag);
    }

    if (coda == std::string::npos) {
      // Start tag without a matching end tag. Without this check the
      // arithmetic on npos below wraps around and the loop can revisit the
      // same start tag over and over.
      return -1;
    }

    const size_t inner_begin = prelude + len_start_tag;
    word = line.substr(inner_begin, coda - inner_begin);

    // A tagged span must not contain any other tag.
    if ((word.find(__partial_start__) != std::string::npos) ||
        (word.find(__partial_end__)   != std::string::npos) ||
        (word.find(__word_start__)    != std::string::npos) ||
        (word.find(__word_end__)      != std::string::npos)) {
      return -1;
    }

    // Keep the span together with its start and end tags.
    words.push_back( line.substr(prelude, coda - prelude + len_end_tag) );
    offset = coda + len_end_tag;

    offset1 = line.find(__partial_start__, offset);
    offset2 = line.find(__word_start__, offset);

    if (offset1 == std::string::npos && offset2 == std::string::npos) {
      // No more tagged spans: the remainder of the line is one plain chunk.
      word = line.substr(offset);
      if (word.length() > 0) { words.push_back( word ); }
      break;
    }
  }
  return 1;
}
Exemplo n.º 2
0
Arquivo: io.cpp Projeto: 153370771/ltp
// Read one line from the input stream and turn it into an Instance.
//
// Every whitespace-separated token of the line becomes one form. When
// with_tag is set, each token is split once from the right by delimiter into
// a form and a tag.
//
//  @return  A newly allocated Instance, which is handed over to the caller.
//           0 is returned when the stream is at EOF, when the trimmed line is
//           empty, or (with_tag only) when a token does not split into
//           exactly two parts; the offending token is printed to std::cerr.
Instance* PostaggerReader::next() {
  if (is.eof()) { return 0; }

  // Progress report every `interval` calls, only when tracing is enabled.
  ++cursor;
  if (trace && (cursor % interval) == 0) {
    INFO_LOG("reading: read %d0%% instances.", (cursor/ interval));
  }

  Instance* instance = new Instance;

  std::string raw_line;
  std::getline(is, raw_line);
  trim(raw_line);

  if (raw_line.empty()) {
    delete instance;
    return 0;
  }

  std::vector<std::string> tokens = split(raw_line);
  for (size_t k = 0; k < tokens.size(); ++k) {
    std::string& token = tokens[k];

    if (!with_tag) {
      // Plain input: the token itself is the form.
      instance->raw_forms.push_back(token);
      instance->forms.push_back(sbc2dbc_x(token));
      continue;
    }

    std::vector<std::string> parts = rsplit_by_sep(token, delimiter, 1);
    if (parts.size() != 2) {
      // Token without a form/tag pair: report it and give up on this line.
      std::cerr << token << std::endl;
      delete instance;
      return 0;
    }

    instance->raw_forms.push_back(parts[0]);
    instance->forms.push_back(sbc2dbc_x(parts[0]));
    instance->tags.push_back(parts[1]);
  }

  return instance;
}
Exemplo n.º 3
0
// Read the next sentence from the input stream.
//
// Lines are consumed until a blank line or EOF; each non-blank line is split
// into columns and contributes one token. A ROOT pseudo token is always
// stored at index 0 of every field, with head -1.
//
//  @return  A newly allocated Instance, which is handed over to the caller.
//           NULL is returned when the stream is already at EOF, when no token
//           line was read, or when a line has fewer than 8 columns. In the
//           last case a warning is logged and the remaining lines of that
//           sentence are left unread in the stream.
Instance* CoNLLReader::next() {
  if (is.eof()) { return NULL;}

  Instance* inst = new Instance;
  std::string line;

  // Slot 0 of every field is the artificial ROOT token.
  inst->raw_forms.push_back( SpecialOption::ROOT );
  inst->forms.push_back( SpecialOption::ROOT );
  inst->lemmas.push_back( SpecialOption::ROOT );
  inst->postags.push_back( SpecialOption::ROOT );
  inst->heads.push_back( -1 );
  inst->deprels.push_back( SpecialOption::ROOT );

  while (!is.eof()) {
    getline(is, line);
    trim(line);

    // A blank line terminates the sentence.
    if (line.size() == 0) { break; }
    std::vector<std::string> items = split(line);

    if (items.size() < 8) {
      // items[1]..items[7] are read below; with fewer than 8 columns that
      // would index past the end of the vector, so give up on this instance.
      WARNING_LOG("Unknown conll format file");
      delete inst;
      return NULL;
    }

    inst->raw_forms.push_back( items[1] );  // items[1]: form
    inst->forms.push_back( sbc2dbc(items[1]) );
    inst->lemmas.push_back( items[2] );     // items[2]: lemma
    // NOTE(review): the previous comment named items[4] as the postag while
    // the code has always read items[3] -- confirm which column is intended.
    inst->postags.push_back( items[3] );    // items[3]: postag
    inst->heads.push_back( to_int(items[6]) );  // items[6]: head
    inst->deprels.push_back( items[7] );        // items[7]: deprel
  }

  // Only the ROOT token is present: nothing was read.
  if (inst->forms.size() == 1) {
    delete inst;
    inst = NULL;
  }

  return inst;
}