Пример #1
0
/**
 * Word-segmentation stage of the LTP pipeline.
 *
 * Segments every sentence stored in `xml` and writes the resulting word
 * lists back into it, then marks the document with NOTE_WORD.
 *
 * @param xml  document to process; modified in place.
 * @return 0 on success, or when the document already carries NOTE_WORD;
 *         the code returned by splitSentence_dummy() if that step fails;
 *         kWordsegError if no segmentor is available or segmenting a
 *         sentence yields 0;
 *         kEmptyStringError if the document holds no sentences;
 *         kSentenceTooLongError if a sentence exceeds MAX_SENTENCE_LEN;
 *         kWriteXmlError if the words cannot be stored back into `xml`.
 */
int LTP::wordseg(XML4NLP & xml) {
  // Already segmented: nothing left to do.
  if (xml.QueryNote(NOTE_WORD)) {
    return 0;
  }

  // Sentence splitting is a prerequisite; forward its error code untouched.
  int splitStatus = splitSentence_dummy(xml);
  if (splitStatus != 0) {
    ERROR_LOG("in LTP::wordseg, failed to perform split sentence preprocess.");
    return splitStatus;
  }

  // Fetch the segmentor handle from the shared resource holder.
  void * segmentorHandle = _resource.GetSegmentor();
  if (!segmentorHandle) {
    ERROR_LOG("in LTP::wordseg, failed to init a segmentor");
    return kWordsegError;
  }

  int sentenceCount = xml.CountSentenceInDocument();
  if (sentenceCount == 0) {
    ERROR_LOG("in LTP::wordseg, number of sentence equals 0");
    return kEmptyStringError;
  }

  for (int sentIdx = 0; sentIdx < sentenceCount; ++sentIdx) {
    std::string sentence = xml.GetSentence(sentIdx);
    // A fresh output vector per sentence, so results never carry over.
    std::vector<std::string> words;

    // Reject over-long input before handing it to the segmentor.
    if (ltp::strutils::codecs::length(sentence) > MAX_SENTENCE_LEN) {
      ERROR_LOG("in LTP::wordseg, input sentence is too long");
      return kSentenceTooLongError;
    }

    // A zero return from the segmentor is treated as failure.
    if (segmentor_segment(segmentorHandle, sentence, words) == 0) {
      ERROR_LOG("in LTP::wordseg, failed to perform word segment on \"%s\"",
                sentence.c_str());
      return kWordsegError;
    }

    if (xml.SetWordsToSentence(words, sentIdx) != 0) {
      ERROR_LOG("in LTP::wordseg, failed to write segment result to xml");
      return kWriteXmlError;
    }
  }

  // Record completion so later calls short-circuit at the top.
  xml.SetNote(NOTE_WORD);
  return 0;
}